/*
 *	net/ipv6/ip6_output.c
 *
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>

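/* Final transmit step: resolve the IPv6 next hop to a neighbour entry and
 * hand the packet to the device layer. Multicast packets are looped back
 * to local listeners when required and have their scope checked before
 * transmission.
 */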
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

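/* Fragment the packet if it exceeds the path MTU (and is not GSO), or if
 * the route or conntrack defrag state demands it; otherwise pass it
 * straight to ip6_finish_output2().
 */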
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

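/* dst_output() entry point: drop everything when IPv6 is administratively
 * disabled on the egress device, otherwise run the packet through the
 * NF_INET_POST_ROUTING hook on its way to ip6_finish_output().
 */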
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			if (skb->sk)
				skb_set_owner_w(skb2, skb->sk);
			consume_skb(skb);
			skb = skb2;
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

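/* Deliver Router Alert packets to every raw socket that registered for
 * this RA selector (IPV6_ROUTER_ALERT); the last matching socket gets the
 * original skb, earlier ones get clones. Returns 1 if anyone took it.
 */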
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

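/* Classify a packet destined to a proxied address: returns 1 to hand it
 * to local input (unicast neighbour discovery), 0 to keep forwarding it,
 * or -1 to drop it (link-local destination, per the MIPv6 spec).
 */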
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	skb_sender_cpu_clear(skb);
	return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

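/* Forward a packet on behalf of another host: validate it, honour Router
 * Alert, send ICMPv6 errors (hop limit exceeded, packet too big), emit a
 * redirect when the packet leaves through the interface it arrived on,
 * then decrement the hop limit and pass it to the NF_INET_FORWARD hook.
 */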
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

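/* Split a too-large packet into fragments no bigger than the path MTU and
 * feed each one to @output. The fast path reuses an existing frag_list as
 * ready-made fragments; the slow path copies the data into freshly
 * allocated skbs, one per fragment.
 */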
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}

		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

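/* Validate a socket's cached route against the current flow: release it
 * and return NULL when it is not IPv6 or no longer matches the flow's
 * destination (or source, with subtrees) or outgoing interface.
 */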
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

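/* Common tail of the dst lookup helpers below: perform the route lookup
 * for @fl6, selecting a source address first when none was given, and
 * (under CONFIG_IPV6_OPTIMISTIC_DAD) retry through the default router if
 * the chosen source address is still optimistic and the next hop is
 * unresolved.
 */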
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (!fl6->flowi6_oif)
		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

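/* Queue a UDP datagram as one large GSO skb for UDP fragmentation offload
 * (UFO); the device later segments it into fragments of gso_size bytes,
 * which therefore must be a multiple of 8.
 */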
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, const struct flowi6 *fl6)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

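/* Refresh mtu/maxfraglen while appending: outside of an xfrm tunnel dst,
 * the first fragment must reserve dst.header_len, while later fragments
 * may treat that headroom as data space.
 */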
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

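/* Initialize the cork for a corked send: duplicate the tx options so they
 * outlive the caller, pin the route, and derive the fragment size from
 * the path MTU (optionally capped by np->frag_size, i.e. IPV6_MTU).
 */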
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

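/* Workhorse shared by ip6_append_data() and ip6_make_skb(): append user
 * data to the pending queue, topping up the tail skb or allocating new
 * ones so every packet can later be fragmented on a maxfraglen boundary,
 * and diverting to the UFO path when the device can segment for us.
 */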
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length < mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM)
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if ((skb && skb_is_gso(skb)) ||
	    (((length + (skb ? skb->len : headersize)) > mtu) &&
	    (skb_queue_len(queue) <= 1) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

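/* Public corked-send API: on the first call (empty write queue) set up the
 * cork, then append @length bytes fetched through @getfrag to the
 * socket's write queue; ip6_push_pending_frames() transmits the result.
 */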
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

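/* Merge the queued skbs into a single packet (chained via frag_list),
 * push the extension headers and the IPv6 header, fill in flow label,
 * hop limit and addresses, and release the cork. The returned skb is
 * ready for ip6_send_skb().
 */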
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

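/* Send a packet finished by __ip6_make_skb(), translating any positive
 * qdisc return value into an errno via net_xmit_errno().
 */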
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

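/* One-shot counterpart of ip6_append_data(): build the whole datagram on
 * a private queue and return the finished skb (or an ERR_PTR on error)
 * without touching the socket's write queue.
 */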
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}