/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
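/* Final output step: resolve the next hop to a neighbour entry and hand the
 * skb to the neighbour layer for transmission. Multicast packets may first
 * be looped back to local listeners, and are dropped when their scope does
 * not allow them off the box.
 */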
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
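/* Post-POST_ROUTING finish handler: run cgroup BPF egress filters, re-route
 * through dst_output() when an XFRM policy installed a new dst after SNAT,
 * and fragment packets that exceed the path MTU (unless GSO will segment
 * them later).
 */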
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
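/* Standard output handler for IPv6 dst entries, called via dst_output().
 * Runs the NF_INET_POST_ROUTING hook unless the packet was marked as
 * re-routed, and drops everything when IPv6 is disabled on the device.
 */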
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
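/* Flow-label policy for this socket: an explicit IPV6_AUTOFLOWLABEL socket
 * option wins; otherwise the per-namespace default applies.
 */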
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
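/* Classify a packet arriving for an address we proxy: returns 1 when it is
 * an NDISC message that must be delivered locally, -1 when it has to be
 * dropped after signalling link failure, and 0 to forward it normally.
 */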
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a reaction involving a unicast neighbor
			 * discovery message destined to the proxied address,
			 * pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}
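/* MTU to honour when forwarding via this dst: a locked RTAX_MTU metric
 * takes precedence, otherwise the egress device's IPv6 MTU is used.
 */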
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
		return false;

	return true;
}
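/* The IPv6 forwarding path: sanity-check the packet, handle router-alert
 * and proxy-NDP cases, send redirects where allowed, enforce the path MTU
 * and finally decrement hop_limit before NF_INET_FORWARD and output.
 */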
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
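/* Copy routing, QoS and netfilter metadata from the original packet to a
 * freshly built fragment.
 */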
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
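/* Split a too-big packet into fragments. The fast path reuses an existing
 * frag_list whose geometry already fits the MTU; otherwise the slow path
 * allocates new skbs and copies the payload block by block.
 */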
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
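/* Nonzero when the cached route cannot be trusted for this flow address:
 * it is neither an exact host route for the address nor validated by the
 * socket's cached last-used address.
 */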
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
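/* Recompute mtu/maxfraglen while appending data: only the first fragment
 * needs to reserve the dst header_len (e.g. an XFRM tunnel header); later
 * fragments may use that space for payload.
 */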
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
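/* Prepare a corked send: duplicate the tx options so they outlive the
 * caller, pin the route in the cork and precompute the fragment size.
 */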
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
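/* Workhorse behind ip6_append_data() and ip6_make_skb(): append user data
 * to the queue, filling the tail skb and starting new fragment-sized skbs
 * (or page frags when the device supports scatter/gather) as needed.
 */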
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ?
			mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu &&
				       !(cork->flags & IPCORK_ALLFRAG) ?
				       mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen -
					  rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
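/* Release everything pinned by the cork: the duplicated tx options and the
 * held route.
 */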
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}
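/* Collapse the queued skbs into a single packet carried in a frag_list,
 * push extension headers plus the IPv6 header, then release the cork.
 */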
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
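/* Hand a packet built by __ip6_make_skb() to ip6_local_out() and account
 * for any drop in the MIB counters.
 */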
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
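/* Drop everything still queued when a corked send is aborted, then release
 * the cork.
 */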
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
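/* Uncorked counterpart of ip6_append_data()/ip6_push_pending_frames():
 * build the complete packet on a private queue in one call and return it
 * (or an ERR_PTR) without touching sk_write_queue.
 */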
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}