/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

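/* Final transmit step: loop multicast back to local listeners when
 * required, honor lwtunnel output redirects, then resolve the nexthop
 * neighbour and hand the skb to neigh_output(). Packets with no
 * resolvable neighbour are counted as OUTNOROUTES and dropped.
 */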
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

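/* Runs after POST_ROUTING: gives the cgroup BPF egress program a chance
 * to drop the skb, re-enters dst_output() when an XFRM policy lookup
 * after SNAT attached a new dst, and fragments when the packet exceeds
 * the dst MTU (or conntrack's frag_max_size) and is not GSO.
 */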
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

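/* Output entry point installed as dst->output for unicast routes: tags
 * the skb with protocol and device, drops everything when IPv6 is
 * administratively disabled on the device, and traverses
 * NF_INET_POST_ROUTING, skipping the hook for packets netfilter has
 * already rerouted (IP6SKB_REROUTED).
 */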
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

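/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain for this alert value. Returns 1 if at least one socket
 * consumed the skb, 0 if the caller still owns it.
 */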
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

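/* Decide what to do with a packet whose destination we proxy:
 * returns 1 to hand unicast NDISC messages to the local input path,
 * 0 to forward normally, and -1 when the destination is link-local
 * and must not be forwarded (packet is dropped with a link failure).
 */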
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

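/* Main IPv6 forwarding path: validates the hop limit, hands Router
 * Alert packets to ip6_call_ra_chain(), handles NDISC proxying, emits
 * redirects when the packet leaves through the interface it arrived on,
 * enforces the forwarding MTU, and finally decrements hop_limit (after
 * skb_cow()) before traversing NF_INET_FORWARD.
 */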
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not do any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any warranty that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

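/* Fragment an oversized packet. The fast path reuses an existing
 * frag_list when its geometry already matches fragment boundaries; the
 * slow path allocates and copies one fragment at a time. Every fragment
 * is handed to @output, and FRAGCREATES/FRAGOKS/FRAGFAILS are accounted.
 */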
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

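/* Core of the dst lookup helpers below: selects a source address when
 * the flow has none, performs the routing lookup, and (with optimistic
 * DAD enabled) falls back to the default router's dst while the chosen
 * source address is still OPTIMISTIC and the nexthop is unresolved.
 */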
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

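/* Initialise cork state for ip6_append_data()/ip6_make_skb(): duplicate
 * the tx options so they outlive the caller, pin the route and flow,
 * and derive the fragment size from the path MTU and the socket's
 * pmtudisc/frag_size settings.
 */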
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

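/* Append user data to the queue of pending skbs, growing the tail skb
 * or allocating new ones so that every skb except the last is
 * maxfraglen bytes long and 8-byte aligned, ready to be stitched
 * together by __ip6_make_skb(). Write-memory accounting is batched in
 * wmem_alloc_delta and charged to the socket once at the end.
 */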
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

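/* Public corked-output interface (UDP, raw, ICMPv6 sockets): sets up
 * cork state on the first call and appends to sk_write_queue until the
 * caller flushes with ip6_push_pending_frames() or discards with
 * ip6_flush_pending_frames().
 */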
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

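/* Collapse the queued skbs into one packet (tail skbs become frag_list
 * members of the first), push extension headers and the IPv6 header,
 * and release the cork. Returns the finished skb, or NULL if the queue
 * was empty.
 */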
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

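/* Transmit a packet built by __ip6_make_skb() through ip6_local_out(),
 * mapping qdisc return codes to errnos and accounting drops.
 */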
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

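/* Single-shot variant of ip6_append_data() + push: builds the whole
 * packet on a private queue with caller-provided cork state, so no
 * socket write-queue locking is needed, and returns the finished skb
 * (or an ERR_PTR) without sending it.
 */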
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}