net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/bpf-cgroup.h>
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58 #include <linux/mroute6.h>
  59 #include <net/l3mdev.h>
  60 #include <net/lwtunnel.h>
  61
  62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  63 {
  64         struct dst_entry *dst = skb_dst(skb);
  65         struct net_device *dev = dst->dev;
  66         struct neighbour *neigh;
  67         struct in6_addr *nexthop;
  68         int ret;
  69
  70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  72
  73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  74                     ((mroute6_socket(net, skb) &&
  75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  77                                          &ipv6_hdr(skb)->saddr))) {
  78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  79
  80                         /* Do not check for IFF_ALLMULTI; multicast routing
  81                            is not supported in any case.
  82                          */
  83                         if (newskb)
  84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  85                                         net, sk, newskb, NULL, newskb->dev,
  86                                         dev_loopback_xmit);
  87
  88                         if (ipv6_hdr(skb)->hop_limit == 0) {
  89                                 IP6_INC_STATS(net, idev,
  90                                               IPSTATS_MIB_OUTDISCARDS);
  91                                 kfree_skb(skb);
  92                                 return 0;
  93                         }
  94                 }
  95
  96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  97
  98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  99                     IPV6_ADDR_SCOPE_NODELOCAL &&
 100                     !(dev->flags & IFF_LOOPBACK)) {
 101                         kfree_skb(skb);
 102                         return 0;
 103                 }
 104         }
 105
 106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 107                 int res = lwtunnel_xmit(skb);
 108
 109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 110                         return res;
 111         }
 112
 113         rcu_read_lock_bh();
 114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 116         if (unlikely(!neigh))
 117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 118         if (!IS_ERR(neigh)) {
 119                 sock_confirm_neigh(skb, neigh);
 120                 ret = neigh_output(neigh, skb);
 121                 rcu_read_unlock_bh();
 122                 return ret;
 123         }
 124         rcu_read_unlock_bh();
 125
 126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 127         kfree_skb(skb);
 128         return -EINVAL;
 129 }
 130
 131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 132 {
 133         int ret;
 134
 135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 136         if (ret) {
 137                 kfree_skb(skb);
 138                 return ret;
 139         }
 140
 141         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 142             dst_allfrag(skb_dst(skb)) ||
 143             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 144                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 145         else
 146                 return ip6_finish_output2(net, sk, skb);
 147 }
 148
 149 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 150 {
 151         struct net_device *dev = skb_dst(skb)->dev;
 152         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 153
 154         skb->protocol = htons(ETH_P_IPV6);
 155         skb->dev = dev;
 156
 157         if (unlikely(idev->cnf.disable_ipv6)) {
 158                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 159                 kfree_skb(skb);
 160                 return 0;
 161         }
 162
 163         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 164                             net, sk, skb, NULL, dev,
 165                             ip6_finish_output,
 166                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 167 }
 168
 169 /*
 170  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 171  * Note : socket lock is not held for SYNACK packets, but might be modified
 172  * by calls to skb_set_owner_w() and ipv6_local_error(),
 173  * which are using proper atomic operations or spinlocks.
 174  */
 175 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 176              __u32 mark, struct ipv6_txoptions *opt, int tclass)
 177 {
 178         struct net *net = sock_net(sk);
 179         const struct ipv6_pinfo *np = inet6_sk(sk);
 180         struct in6_addr *first_hop = &fl6->daddr;
 181         struct dst_entry *dst = skb_dst(skb);
 182         struct ipv6hdr *hdr;
 183         u8  proto = fl6->flowi6_proto;
 184         int seg_len = skb->len;
 185         int hlimit = -1;
 186         u32 mtu;
 187
 188         if (opt) {
 189                 unsigned int head_room;
 190
 191                 /* First: exthdrs may take lots of space (~8K for now)
 192                    MAX_HEADER is not enough.
 193                  */
 194                 head_room = opt->opt_nflen + opt->opt_flen;
 195                 seg_len += head_room;
 196                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 197
 198                 if (skb_headroom(skb) < head_room) {
 199                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 200                         if (!skb2) {
 201                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 202                                               IPSTATS_MIB_OUTDISCARDS);
 203                                 kfree_skb(skb);
 204                                 return -ENOBUFS;
 205                         }
 206                         consume_skb(skb);
 207                         skb = skb2;
 208                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 209                          * it is safe to call in our context (socket lock not held)
 210                          */
 211                         skb_set_owner_w(skb, (struct sock *)sk);
 212                 }
 213                 if (opt->opt_flen)
 214                         ipv6_push_frag_opts(skb, opt, &proto);
 215                 if (opt->opt_nflen)
 216                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 217                                              &fl6->saddr);
 218         }
 219
 220         skb_push(skb, sizeof(struct ipv6hdr));
 221         skb_reset_network_header(skb);
 222         hdr = ipv6_hdr(skb);
 223
 224         /*
 225          *      Fill in the IPv6 header
 226          */
 227         if (np)
 228                 hlimit = np->hop_limit;
 229         if (hlimit < 0)
 230                 hlimit = ip6_dst_hoplimit(dst);
 231
 232         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 233                                                      np->autoflowlabel, fl6));
 234
 235         hdr->payload_len = htons(seg_len);
 236         hdr->nexthdr = proto;
 237         hdr->hop_limit = hlimit;
 238
 239         hdr->saddr = fl6->saddr;
 240         hdr->daddr = *first_hop;
 241
 242         skb->protocol = htons(ETH_P_IPV6);
 243         skb->priority = sk->sk_priority;
 244         skb->mark = mark;
 245
 246         mtu = dst_mtu(dst);
 247         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 248                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 249                               IPSTATS_MIB_OUT, skb->len);
 250
 251                 /* if egress device is enslaved to an L3 master device pass the
 252                  * skb to its handler for processing
 253                  */
 254                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 255                 if (unlikely(!skb))
 256                         return 0;
 257
 258                 /* hooks should never assume socket lock is held.
 259                  * we promote our socket to non const
 260                  */
 261                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 262                                net, (struct sock *)sk, skb, NULL, dst->dev,
 263                                dst_output);
 264         }
 265
 266         skb->dev = dst->dev;
 267         /* ipv6_local_error() does not require socket lock,
 268          * we promote our socket to non const
 269          */
 270         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 271
 272         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 273         kfree_skb(skb);
 274         return -EMSGSIZE;
 275 }
 276 EXPORT_SYMBOL(ip6_xmit);
 277
 278 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 279 {
 280         struct ip6_ra_chain *ra;
 281         struct sock *last = NULL;
 282
 283         read_lock(&ip6_ra_lock);
 284         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 285                 struct sock *sk = ra->sk;
 286                 if (sk && ra->sel == sel &&
 287                     (!sk->sk_bound_dev_if ||
 288                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 289                         if (last) {
 290                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 291                                 if (skb2)
 292                                         rawv6_rcv(last, skb2);
 293                         }
 294                         last = sk;
 295                 }
 296         }
 297
 298         if (last) {
 299                 rawv6_rcv(last, skb);
 300                 read_unlock(&ip6_ra_lock);
 301                 return 1;
 302         }
 303         read_unlock(&ip6_ra_lock);
 304         return 0;
 305 }
 306
 307 static int ip6_forward_proxy_check(struct sk_buff *skb)
 308 {
 309         struct ipv6hdr *hdr = ipv6_hdr(skb);
 310         u8 nexthdr = hdr->nexthdr;
 311         __be16 frag_off;
 312         int offset;
 313
 314         if (ipv6_ext_hdr(nexthdr)) {
 315                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 316                 if (offset < 0)
 317                         return 0;
 318         } else
 319                 offset = sizeof(struct ipv6hdr);
 320
 321         if (nexthdr == IPPROTO_ICMPV6) {
 322                 struct icmp6hdr *icmp6;
 323
 324                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 325                                          offset + 1 - skb->data)))
 326                         return 0;
 327
 328                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 329
 330                 switch (icmp6->icmp6_type) {
 331                 case NDISC_ROUTER_SOLICITATION:
 332                 case NDISC_ROUTER_ADVERTISEMENT:
 333                 case NDISC_NEIGHBOUR_SOLICITATION:
 334                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 335                 case NDISC_REDIRECT:
 336                         /* For reaction involving unicast neighbor discovery
 337                          * message destined to the proxied address, pass it to
 338                          * input function.
 339                          */
 340                         return 1;
 341                 default:
 342                         break;
 343                 }
 344         }
 345
 346         /*
 347          * The proxying router can't forward traffic sent to a link-local
 348          * address, so signal the sender and discard the packet. This
 349          * behavior is clarified by the MIPv6 specification.
 350          */
 351         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 352                 dst_link_failure(skb);
 353                 return -1;
 354         }
 355
 356         return 0;
 357 }
 358
 359 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 360                                      struct sk_buff *skb)
 361 {
 362         return dst_output(net, sk, skb);
 363 }
 364
 365 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 366 {
 367         unsigned int mtu;
 368         struct inet6_dev *idev;
 369
 370         if (dst_metric_locked(dst, RTAX_MTU)) {
 371                 mtu = dst_metric_raw(dst, RTAX_MTU);
 372                 if (mtu)
 373                         return mtu;
 374         }
 375
 376         mtu = IPV6_MIN_MTU;
 377         rcu_read_lock();
 378         idev = __in6_dev_get(dst->dev);
 379         if (idev)
 380                 mtu = idev->cnf.mtu6;
 381         rcu_read_unlock();
 382
 383         return mtu;
 384 }
 385
 386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 387 {
 388         if (skb->len <= mtu)
 389                 return false;
 390
 391         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 392         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 393                 return true;
 394
 395         if (skb->ignore_df)
 396                 return false;
 397
 398         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
 399                 return false;
 400
 401         return true;
 402 }
 403
  404 int ip6_forward(struct sk_buff *skb)
  405 {
              /*
               * Forward an IPv6 packet along the route already attached to
               * the skb.
               *
               * Return values:
               *   0           a Router Alert socket took the packet
               *   -ETIMEDOUT  hop limit was <= 1; a Time Exceeded ICMPv6
               *               error was sent and the packet freed
               *   -EMSGSIZE   packet exceeds the forwarding MTU; a Packet Too
               *               Big ICMPv6 error was sent and the packet freed
               *   -EINVAL     any other drop (labels "error" and "drop")
               * Otherwise the result of ip6_input() for proxied neighbour
               * discovery messages, or of the NF_INET_FORWARD hook.
               */
  406         struct dst_entry *dst = skb_dst(skb);
  407         struct ipv6hdr *hdr = ipv6_hdr(skb);
  408         struct inet6_skb_parm *opt = IP6CB(skb);
  409         struct net *net = dev_net(dst->dev);
  410         u32 mtu;
  411
              /* Forwarding switched off in devconf_all: counted as an
               * input address error (see the "error" label).
               */
  412         if (net->ipv6.devconf_all->forwarding == 0)
  413                 goto error;
  414
              /* Only packets with pkt_type PACKET_HOST are forwarded. */
  415         if (skb->pkt_type != PACKET_HOST)
  416                 goto drop;
  417
              /* An skb that is attached to a socket is not forwarded. */
  418         if (unlikely(skb->sk))
  419                 goto drop;
  420
  421         if (skb_warn_if_lro(skb))
  422                 goto drop;
  423
  424         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
  425                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  426                                 IPSTATS_MIB_INDISCARDS);
  427                 goto drop;
  428         }
  429
  430         skb_forward_csum(skb);
  431
  432         /*
  433          *      We DO NOT make any processing on
  434          *      RA packets, pushing them to user level AS IS
  435          *      without any WARRANTY that application will be able
  436          *      to interpret them. The reason is that we
  437          *      cannot make anything clever here.
  438          *
  439          *      We are not end-node, so that if packet contains
  440          *      AH/ESP, we cannot make anything.
  441          *      Defragmentation also would be mistake, RA packets
  442          *      cannot be fragmented, because there is no warranty
  443          *      that different fragments will go along one path. --ANK
  444          */
  445         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
  446                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
  447                         return 0;
  448         }
  449
  450         /*
  451          *      check and decrement ttl
  452          */
  453         if (hdr->hop_limit <= 1) {
  454                 /* Force OUTPUT device used as source address */
  455                 skb->dev = dst->dev;
  456                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
  457                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  458                                 IPSTATS_MIB_INHDRERRORS);
  459
  460                 kfree_skb(skb);
  461                 return -ETIMEDOUT;
  462         }
  463
  464         /* XXX: idev->cnf.proxy_ndp? */
              /* Destination found in the proxy neighbour table: the proxy
               * check decides between local input (> 0), drop (< 0) and
               * normal forwarding (0).
               */
  465         if (net->ipv6.devconf_all->proxy_ndp &&
  466             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
  467                 int proxied = ip6_forward_proxy_check(skb);
  468                 if (proxied > 0)
  469                         return ip6_input(skb);
  470                 else if (proxied < 0) {
  471                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
  472                                         IPSTATS_MIB_INDISCARDS);
  473                         goto drop;
  474                 }
  475         }
  476
  477         if (!xfrm6_route_forward(skb)) {
  478                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  479                                 IPSTATS_MIB_INDISCARDS);
  480                 goto drop;
  481         }
              /* Re-read the route from the skb after the xfrm step. */
  482         dst = skb_dst(skb);
  483
  484         /* IPv6 specs say nothing about it, but it is clear that we cannot
  485            send redirects to source routed frames.
  486            We don't send redirects to frames decapsulated from IPsec.
  487          */
  488         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
  489                 struct in6_addr *target = NULL;
  490                 struct inet_peer *peer;
  491                 struct rt6_info *rt;
  492
  493                 /*
  494                  *      incoming and outgoing devices are the same
  495                  *      send a redirect.
  496                  */
  497
  498                 rt = (struct rt6_info *) dst;
                      /* Redirect target: the gateway for gateway routes,
                       * otherwise the destination itself.
                       */
  499                 if (rt->rt6i_flags & RTF_GATEWAY)
  500                         target = &rt->rt6i_gateway;
  501                 else
  502                         target = &hdr->daddr;
  503
  504                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
  505
  506                 /* Limit redirects both by destination (here)
  507                    and by source (inside ndisc_send_redirect)
  508                  */
  509                 if (inet_peer_xrlim_allow(peer, 1*HZ))
  510                         ndisc_send_redirect(skb, target);
  511                 if (peer)
  512                         inet_putpeer(peer);
  513         } else {
  514                 int addrtype = ipv6_addr_type(&hdr->saddr);
  515
  516                 /* This check is security critical. */
  517                 if (addrtype == IPV6_ADDR_ANY ||
  518                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
  519                         goto error;
  520                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
  521                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
  522                                     ICMPV6_NOT_NEIGHBOUR, 0);
  523                         goto error;
  524                 }
  525         }
  526
              /* The forwarding MTU is never taken below IPV6_MIN_MTU. */
  527         mtu = ip6_dst_mtu_forward(dst);
  528         if (mtu < IPV6_MIN_MTU)
  529                 mtu = IPV6_MIN_MTU;
  530
  531         if (ip6_pkt_too_big(skb, mtu)) {
  532                 /* Again, force OUTPUT device used as source address */
  533                 skb->dev = dst->dev;
  534                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
  535                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  536                                 IPSTATS_MIB_INTOOBIGERRORS);
  537                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  538                                 IPSTATS_MIB_FRAGFAILS);
  539                 kfree_skb(skb);
  540                 return -EMSGSIZE;
  541         }
  542
  543         if (skb_cow(skb, dst->dev->hard_header_len)) {
  544                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
  545                                 IPSTATS_MIB_OUTDISCARDS);
  546                 goto drop;
  547         }
  548
              /* Header pointer is re-read after skb_cow(). */
  549         hdr = ipv6_hdr(skb);
  550
  551         /* Mangling hops number delayed to point after skb COW */
  552
  553         hdr->hop_limit--;
  554
  555         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
  556         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
  557         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
  558                        net, NULL, skb, skb->dev, dst->dev,
  559                        ip6_forward_finish);
  560
              /* "error" accounts an input address error and falls through
               * to "drop", which frees the skb and returns -EINVAL.
               */
  561 error:
  562         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
  563 drop:
  564         kfree_skb(skb);
  565         return -EINVAL;
  566 }
 567
 568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 569 {
 570         to->pkt_type = from->pkt_type;
 571         to->priority = from->priority;
 572         to->protocol = from->protocol;
 573         skb_dst_drop(to);
 574         skb_dst_set(to, dst_clone(skb_dst(from)));
 575         to->dev = from->dev;
 576         to->mark = from->mark;
 577
 578 #ifdef CONFIG_NET_SCHED
 579         to->tc_index = from->tc_index;
 580 #endif
 581         nf_copy(to, from);
 582         skb_copy_secmark(to, from);
 583 }
 584
 585 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 586                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 587 {
 588         struct sk_buff *frag;
 589         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 590         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 591                                 inet6_sk(skb->sk) : NULL;
 592         struct ipv6hdr *tmp_hdr;
 593         struct frag_hdr *fh;
 594         unsigned int mtu, hlen, left, len;
 595         int hroom, troom;
 596         __be32 frag_id;
 597         int ptr, offset = 0, err = 0;
 598         u8 *prevhdr, nexthdr = 0;
 599
 600         err = ip6_find_1stfragopt(skb, &prevhdr);
 601         if (err < 0)
 602                 goto fail;
 603         hlen = err;
 604         nexthdr = *prevhdr;
 605
 606         mtu = ip6_skb_dst_mtu(skb);
 607
 608         /* We must not fragment if the socket is set to force MTU discovery
 609          * or if the skb it not generated by a local socket.
 610          */
 611         if (unlikely(!skb->ignore_df && skb->len > mtu))
 612                 goto fail_toobig;
 613
 614         if (IP6CB(skb)->frag_max_size) {
 615                 if (IP6CB(skb)->frag_max_size > mtu)
 616                         goto fail_toobig;
 617
 618                 /* don't send fragments larger than what we received */
 619                 mtu = IP6CB(skb)->frag_max_size;
 620                 if (mtu < IPV6_MIN_MTU)
 621                         mtu = IPV6_MIN_MTU;
 622         }
 623
 624         if (np && np->frag_size < mtu) {
 625                 if (np->frag_size)
 626                         mtu = np->frag_size;
 627         }
 628         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 629                 goto fail_toobig;
 630         mtu -= hlen + sizeof(struct frag_hdr);
 631
 632         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 633                                     &ipv6_hdr(skb)->saddr);
 634
 635         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 636             (err = skb_checksum_help(skb)))
 637                 goto fail;
 638
 639         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 640         if (skb_has_frag_list(skb)) {
 641                 unsigned int first_len = skb_pagelen(skb);
 642                 struct sk_buff *frag2;
 643
 644                 if (first_len - hlen > mtu ||
 645                     ((first_len - hlen) & 7) ||
 646                     skb_cloned(skb) ||
 647                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 648                         goto slow_path;
 649
 650                 skb_walk_frags(skb, frag) {
 651                         /* Correct geometry. */
 652                         if (frag->len > mtu ||
 653                             ((frag->len & 7) && frag->next) ||
 654                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 655                                 goto slow_path_clean;
 656
 657                         /* Partially cloned skb? */
 658                         if (skb_shared(frag))
 659                                 goto slow_path_clean;
 660
 661                         BUG_ON(frag->sk);
 662                         if (skb->sk) {
 663                                 frag->sk = skb->sk;
 664                                 frag->destructor = sock_wfree;
 665                         }
 666                         skb->truesize -= frag->truesize;
 667                 }
 668
 669                 err = 0;
 670                 offset = 0;
 671                 /* BUILD HEADER */
 672
 673                 *prevhdr = NEXTHDR_FRAGMENT;
 674                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 675                 if (!tmp_hdr) {
 676                         err = -ENOMEM;
 677                         goto fail;
 678                 }
 679                 frag = skb_shinfo(skb)->frag_list;
 680                 skb_frag_list_init(skb);
 681
 682                 __skb_pull(skb, hlen);
 683                 fh = __skb_push(skb, sizeof(struct frag_hdr));
 684                 __skb_push(skb, hlen);
 685                 skb_reset_network_header(skb);
 686                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 687
 688                 fh->nexthdr = nexthdr;
 689                 fh->reserved = 0;
 690                 fh->frag_off = htons(IP6_MF);
 691                 fh->identification = frag_id;
 692
 693                 first_len = skb_pagelen(skb);
 694                 skb->data_len = first_len - skb_headlen(skb);
 695                 skb->len = first_len;
 696                 ipv6_hdr(skb)->payload_len = htons(first_len -
 697                                                    sizeof(struct ipv6hdr));
 698
 699                 for (;;) {
 700                         /* Prepare header of the next frame,
 701                          * before previous one went down. */
 702                         if (frag) {
 703                                 frag->ip_summed = CHECKSUM_NONE;
 704                                 skb_reset_transport_header(frag);
 705                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
 706                                 __skb_push(frag, hlen);
 707                                 skb_reset_network_header(frag);
 708                                 memcpy(skb_network_header(frag), tmp_hdr,
 709                                        hlen);
 710                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 711                                 fh->nexthdr = nexthdr;
 712                                 fh->reserved = 0;
 713                                 fh->frag_off = htons(offset);
 714                                 if (frag->next)
 715                                         fh->frag_off |= htons(IP6_MF);
 716                                 fh->identification = frag_id;
 717                                 ipv6_hdr(frag)->payload_len =
 718                                                 htons(frag->len -
 719                                                       sizeof(struct ipv6hdr));
 720                                 ip6_copy_metadata(frag, skb);
 721                         }
 722
 723                         err = output(net, sk, skb);
 724                         if (!err)
 725                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 726                                               IPSTATS_MIB_FRAGCREATES);
 727
 728                         if (err || !frag)
 729                                 break;
 730
 731                         skb = frag;
 732                         frag = skb->next;
 733                         skb->next = NULL;
 734                 }
 735
 736                 kfree(tmp_hdr);
 737
 738                 if (err == 0) {
 739                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 740                                       IPSTATS_MIB_FRAGOKS);
 741                         return 0;
 742                 }
 743
 744                 kfree_skb_list(frag);
 745
 746                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 747                               IPSTATS_MIB_FRAGFAILS);
 748                 return err;
 749
 750 slow_path_clean:
 751                 skb_walk_frags(skb, frag2) {
 752                         if (frag2 == frag)
 753                                 break;
 754                         frag2->sk = NULL;
 755                         frag2->destructor = NULL;
 756                         skb->truesize += frag2->truesize;
 757                 }
 758         }
 759
 760 slow_path:
 761         left = skb->len - hlen;         /* Space per frame */
 762         ptr = hlen;                     /* Where to start from */
 763
 764         /*
 765          *      Fragment the datagram.
 766          */
 767
 768         troom = rt->dst.dev->needed_tailroom;
 769
 770         /*
 771          *      Keep copying data until we run out.
 772          */
 773         while (left > 0)        {
 774                 u8 *fragnexthdr_offset;
 775
 776                 len = left;
 777                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 778                 if (len > mtu)
 779                         len = mtu;
 780                 /* IF: we are not sending up to and including the packet end
 781                    then align the next start on an eight byte boundary */
 782                 if (len < left) {
 783                         len &= ~7;
 784                 }
 785
 786                 /* Allocate buffer */
 787                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 788                                  hroom + troom, GFP_ATOMIC);
 789                 if (!frag) {
 790                         err = -ENOMEM;
 791                         goto fail;
 792                 }
 793
 794                 /*
 795                  *      Set up data on packet
 796                  */
 797
 798                 ip6_copy_metadata(frag, skb);
 799                 skb_reserve(frag, hroom);
 800                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 801                 skb_reset_network_header(frag);
 802                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 803                 frag->transport_header = (frag->network_header + hlen +
 804                                           sizeof(struct frag_hdr));
 805
 806                 /*
 807                  *      Charge the memory for the fragment to any owner
 808                  *      it might possess
 809                  */
 810                 if (skb->sk)
 811                         skb_set_owner_w(frag, skb->sk);
 812
 813                 /*
 814                  *      Copy the packet header into the new buffer.
 815                  */
 816                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 817
 818                 fragnexthdr_offset = skb_network_header(frag);
 819                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
 820                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 821
 822                 /*
 823                  *      Build fragment header.
 824                  */
 825                 fh->nexthdr = nexthdr;
 826                 fh->reserved = 0;
 827                 fh->identification = frag_id;
 828
 829                 /*
 830                  *      Copy a block of the IP datagram.
 831                  */
 832                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 833                                      len));
 834                 left -= len;
 835
 836                 fh->frag_off = htons(offset);
 837                 if (left > 0)
 838                         fh->frag_off |= htons(IP6_MF);
 839                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 840                                                     sizeof(struct ipv6hdr));
 841
 842                 ptr += len;
 843                 offset += len;
 844
 845                 /*
 846                  *      Put this fragment into the sending queue.
 847                  */
 848                 err = output(net, sk, frag);
 849                 if (err)
 850                         goto fail;
 851
 852                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 853                               IPSTATS_MIB_FRAGCREATES);
 854         }
 855         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 856                       IPSTATS_MIB_FRAGOKS);
 857         consume_skb(skb);
 858         return err;
 859
 860 fail_toobig:
 861         if (skb->sk && dst_allfrag(skb_dst(skb)))
 862                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 863
 864         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 865         err = -EMSGSIZE;
 866
 867 fail:
 868         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 869                       IPSTATS_MIB_FRAGFAILS);
 870         kfree_skb(skb);
 871         return err;
 872 }
 873
 874 static inline int ip6_rt_check(const struct rt6key *rt_key,
 875                                const struct in6_addr *fl_addr,
 876                                const struct in6_addr *addr_cache)
 877 {
 878         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 879                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 880 }
 881
 882 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 883                                           struct dst_entry *dst,
 884                                           const struct flowi6 *fl6)
 885 {
 886         struct ipv6_pinfo *np = inet6_sk(sk);
 887         struct rt6_info *rt;
 888
 889         if (!dst)
 890                 goto out;
 891
 892         if (dst->ops->family != AF_INET6) {
 893                 dst_release(dst);
 894                 return NULL;
 895         }
 896
 897         rt = (struct rt6_info *)dst;
 898         /* Yes, checking route validity in not connected
 899          * case is not very simple. Take into account,
 900          * that we do not support routing by source, TOS,
 901          * and MSG_DONTROUTE            --ANK (980726)
 902          *
 903          * 1. ip6_rt_check(): If route was host route,
 904          *    check that cached destination is current.
 905          *    If it is network route, we still may
 906          *    check its validity using saved pointer
 907          *    to the last used address: daddr_cache.
 908          *    We do not want to save whole address now,
 909          *    (because main consumer of this service
 910          *    is tcp, which has not this problem),
 911          *    so that the last trick works only on connected
 912          *    sockets.
 913          * 2. oif also should be the same.
 914          */
 915         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 916 #ifdef CONFIG_IPV6_SUBTREES
 917             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 918 #endif
 919            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 920               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 921                 dst_release(dst);
 922                 dst = NULL;
 923         }
 924
 925 out:
 926         return dst;
 927 }
 928
 929 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 930                                struct dst_entry **dst, struct flowi6 *fl6)
 931 {
 932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 933         struct neighbour *n;
 934         struct rt6_info *rt;
 935 #endif
 936         int err;
 937         int flags = 0;
 938
 939         /* The correct way to handle this would be to do
 940          * ip6_route_get_saddr, and then ip6_route_output; however,
 941          * the route-specific preferred source forces the
 942          * ip6_route_output call _before_ ip6_route_get_saddr.
 943          *
 944          * In source specific routing (no src=any default route),
 945          * ip6_route_output will fail given src=any saddr, though, so
 946          * that's why we try it again later.
 947          */
 948         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 949                 struct rt6_info *rt;
 950                 bool had_dst = *dst != NULL;
 951
 952                 if (!had_dst)
 953                         *dst = ip6_route_output(net, sk, fl6);
 954                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 955                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 956                                           sk ? inet6_sk(sk)->srcprefs : 0,
 957                                           &fl6->saddr);
 958                 if (err)
 959                         goto out_err_release;
 960
 961                 /* If we had an erroneous initial result, pretend it
 962                  * never existed and let the SA-enabled version take
 963                  * over.
 964                  */
 965                 if (!had_dst && (*dst)->error) {
 966                         dst_release(*dst);
 967                         *dst = NULL;
 968                 }
 969
 970                 if (fl6->flowi6_oif)
 971                         flags |= RT6_LOOKUP_F_IFACE;
 972         }
 973
 974         if (!*dst)
 975                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
 976
 977         err = (*dst)->error;
 978         if (err)
 979                 goto out_err_release;
 980
 981 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 982         /*
 983          * Here if the dst entry we've looked up
 984          * has a neighbour entry that is in the INCOMPLETE
 985          * state and the src address from the flow is
 986          * marked as OPTIMISTIC, we release the found
 987          * dst entry and replace it instead with the
 988          * dst entry of the nexthop router
 989          */
 990         rt = (struct rt6_info *) *dst;
 991         rcu_read_lock_bh();
 992         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 993                                       rt6_nexthop(rt, &fl6->daddr));
 994         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 995         rcu_read_unlock_bh();
 996
 997         if (err) {
 998                 struct inet6_ifaddr *ifp;
 999                 struct flowi6 fl_gw6;
1000                 int redirect;
1001
1002                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1003                                       (*dst)->dev, 1);
1004
1005                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1006                 if (ifp)
1007                         in6_ifa_put(ifp);
1008
1009                 if (redirect) {
1010                         /*
1011                          * We need to get the dst entry for the
1012                          * default router instead
1013                          */
1014                         dst_release(*dst);
1015                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1016                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1017                         *dst = ip6_route_output(net, sk, &fl_gw6);
1018                         err = (*dst)->error;
1019                         if (err)
1020                                 goto out_err_release;
1021                 }
1022         }
1023 #endif
1024         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1025             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1026                 err = -EAFNOSUPPORT;
1027                 goto out_err_release;
1028         }
1029
1030         return 0;
1031
1032 out_err_release:
1033         dst_release(*dst);
1034         *dst = NULL;
1035
1036         if (err == -ENETUNREACH)
1037                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1038         return err;
1039 }
1040
1041 /**
1042  *      ip6_dst_lookup - perform route lookup on flow
1043  *      @sk: socket which provides route info
1044  *      @dst: pointer to dst_entry * for result
1045  *      @fl6: flow to lookup
1046  *
1047  *      This function performs a route lookup on the given flow.
1048  *
1049  *      It returns zero on success, or a standard errno code on error.
1050  */
1051 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1052                    struct flowi6 *fl6)
1053 {
1054         *dst = NULL;
1055         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1056 }
1057 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1058
1059 /**
1060  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1061  *      @sk: socket which provides route info
1062  *      @fl6: flow to lookup
1063  *      @final_dst: final destination address for ipsec lookup
1064  *
1065  *      This function performs a route lookup on the given flow.
1066  *
1067  *      It returns a valid dst pointer on success, or a pointer encoded
1068  *      error code.
1069  */
1070 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1071                                       const struct in6_addr *final_dst)
1072 {
1073         struct dst_entry *dst = NULL;
1074         int err;
1075
1076         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1077         if (err)
1078                 return ERR_PTR(err);
1079         if (final_dst)
1080                 fl6->daddr = *final_dst;
1081
1082         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1083 }
1084 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1085
1086 /**
1087  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1088  *      @sk: socket which provides the dst cache and route info
1089  *      @fl6: flow to lookup
1090  *      @final_dst: final destination address for ipsec lookup
1091  *
1092  *      This function performs a route lookup on the given flow with the
1093  *      possibility of using the cached route in the socket if it is valid.
1094  *      It will take the socket dst lock when operating on the dst cache.
1095  *      As a result, this function can only be used in process context.
1096  *
1097  *      It returns a valid dst pointer on success, or a pointer encoded
1098  *      error code.
1099  */
1100 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1101                                          const struct in6_addr *final_dst)
1102 {
1103         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1104
1105         dst = ip6_sk_dst_check(sk, dst, fl6);
1106         if (!dst)
1107                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1108
1109         return dst;
1110 }
1111 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1112
1113 static inline int ip6_ufo_append_data(struct sock *sk,
1114                         struct sk_buff_head *queue,
1115                         int getfrag(void *from, char *to, int offset, int len,
1116                         int odd, struct sk_buff *skb),
1117                         void *from, int length, int hh_len, int fragheaderlen,
1118                         int exthdrlen, int transhdrlen, int mtu,
1119                         unsigned int flags, const struct flowi6 *fl6)
1120
1121 {
1122         struct sk_buff *skb;
1123         int err;
1124
1125         /* There is support for UDP large send offload by network
1126          * device, so create one single skb packet containing complete
1127          * udp datagram
1128          */
1129         skb = skb_peek_tail(queue);
1130         if (!skb) {
1131                 skb = sock_alloc_send_skb(sk,
1132                         hh_len + fragheaderlen + transhdrlen + 20,
1133                         (flags & MSG_DONTWAIT), &err);
1134                 if (!skb)
1135                         return err;
1136
1137                 /* reserve space for Hardware header */
1138                 skb_reserve(skb, hh_len);
1139
1140                 /* create space for UDP/IP header */
1141                 skb_put(skb, fragheaderlen + transhdrlen);
1142
1143                 /* initialize network header pointer */
1144                 skb_set_network_header(skb, exthdrlen);
1145
1146                 /* initialize protocol header pointer */
1147                 skb->transport_header = skb->network_header + fragheaderlen;
1148
1149                 skb->protocol = htons(ETH_P_IPV6);
1150                 skb->csum = 0;
1151
1152                 if (flags & MSG_CONFIRM)
1153                         skb_set_dst_pending_confirm(skb, 1);
1154
1155                 __skb_queue_tail(queue, skb);
1156         } else if (skb_is_gso(skb)) {
1157                 goto append;
1158         }
1159
1160         skb->ip_summed = CHECKSUM_PARTIAL;
1161         /* Specify the length of each IPv6 datagram fragment.
1162          * It has to be a multiple of 8.
1163          */
1164         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1165                                      sizeof(struct frag_hdr)) & ~7;
1166         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1167         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1168                                                          &fl6->daddr,
1169                                                          &fl6->saddr);
1170
1171 append:
1172         return skb_append_datato_frags(sk, skb, getfrag, from,
1173                                        (length - transhdrlen));
1174 }
1175
1176 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1177                                                gfp_t gfp)
1178 {
1179         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1180 }
1181
1182 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1183                                                 gfp_t gfp)
1184 {
1185         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1186 }
1187
1188 static void ip6_append_data_mtu(unsigned int *mtu,
1189                                 int *maxfraglen,
1190                                 unsigned int fragheaderlen,
1191                                 struct sk_buff *skb,
1192                                 struct rt6_info *rt,
1193                                 unsigned int orig_mtu)
1194 {
1195         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1196                 if (!skb) {
1197                         /* first fragment, reserve header_len */
1198                         *mtu = orig_mtu - rt->dst.header_len;
1199
1200                 } else {
1201                         /*
1202                          * this fragment is not first, the headers
1203                          * space is regarded as data space.
1204                          */
1205                         *mtu = orig_mtu;
1206                 }
1207                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1208                               + fragheaderlen - sizeof(struct frag_hdr);
1209         }
1210 }
1211
1212 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1213                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1214                           struct rt6_info *rt, struct flowi6 *fl6)
1215 {
1216         struct ipv6_pinfo *np = inet6_sk(sk);
1217         unsigned int mtu;
1218         struct ipv6_txoptions *opt = ipc6->opt;
1219
1220         /*
1221          * setup for corking
1222          */
1223         if (opt) {
1224                 if (WARN_ON(v6_cork->opt))
1225                         return -EINVAL;
1226
1227                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1228                 if (unlikely(!v6_cork->opt))
1229                         return -ENOBUFS;
1230
1231                 v6_cork->opt->tot_len = sizeof(*opt);
1232                 v6_cork->opt->opt_flen = opt->opt_flen;
1233                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1234
1235                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1236                                                     sk->sk_allocation);
1237                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1238                         return -ENOBUFS;
1239
1240                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1241                                                     sk->sk_allocation);
1242                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1243                         return -ENOBUFS;
1244
1245                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1246                                                    sk->sk_allocation);
1247                 if (opt->hopopt && !v6_cork->opt->hopopt)
1248                         return -ENOBUFS;
1249
1250                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1251                                                     sk->sk_allocation);
1252                 if (opt->srcrt && !v6_cork->opt->srcrt)
1253                         return -ENOBUFS;
1254
1255                 /* need source address above miyazawa*/
1256         }
1257         dst_hold(&rt->dst);
1258         cork->base.dst = &rt->dst;
1259         cork->fl.u.ip6 = *fl6;
1260         v6_cork->hop_limit = ipc6->hlimit;
1261         v6_cork->tclass = ipc6->tclass;
1262         if (rt->dst.flags & DST_XFRM_TUNNEL)
1263                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1264                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1265         else
1266                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1267                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1268         if (np->frag_size < mtu) {
1269                 if (np->frag_size)
1270                         mtu = np->frag_size;
1271         }
1272         cork->base.fragsize = mtu;
1273         if (dst_allfrag(rt->dst.path))
1274                 cork->base.flags |= IPCORK_ALLFRAG;
1275         cork->base.length = 0;
1276
1277         return 0;
1278 }
1279
1280 static int __ip6_append_data(struct sock *sk,
1281                              struct flowi6 *fl6,
1282                              struct sk_buff_head *queue,
1283                              struct inet_cork *cork,
1284                              struct inet6_cork *v6_cork,
1285                              struct page_frag *pfrag,
1286                              int getfrag(void *from, char *to, int offset,
1287                                          int len, int odd, struct sk_buff *skb),
1288                              void *from, int length, int transhdrlen,
1289                              unsigned int flags, struct ipcm6_cookie *ipc6,
1290                              const struct sockcm_cookie *sockc)
1291 {
1292         struct sk_buff *skb, *skb_prev = NULL;
1293         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1294         int exthdrlen = 0;
1295         int dst_exthdrlen = 0;
1296         int hh_len;
1297         int copy;
1298         int err;
1299         int offset = 0;
1300         __u8 tx_flags = 0;
1301         u32 tskey = 0;
1302         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1303         struct ipv6_txoptions *opt = v6_cork->opt;
1304         int csummode = CHECKSUM_NONE;
1305         unsigned int maxnonfragsize, headersize;
1306
1307         skb = skb_peek_tail(queue);
1308         if (!skb) {
1309                 exthdrlen = opt ? opt->opt_flen : 0;
1310                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1311         }
1312
1313         mtu = cork->fragsize;
1314         orig_mtu = mtu;
1315
1316         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1317
1318         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1319                         (opt ? opt->opt_nflen : 0);
1320         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1321                      sizeof(struct frag_hdr);
1322
1323         headersize = sizeof(struct ipv6hdr) +
1324                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1325                      (dst_allfrag(&rt->dst) ?
1326                       sizeof(struct frag_hdr) : 0) +
1327                      rt->rt6i_nfheader_len;
1328
1329         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1330             (sk->sk_protocol == IPPROTO_UDP ||
1331              sk->sk_protocol == IPPROTO_RAW)) {
1332                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1333                                 sizeof(struct ipv6hdr));
1334                 goto emsgsize;
1335         }
1336
1337         if (ip6_sk_ignore_df(sk))
1338                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1339         else
1340                 maxnonfragsize = mtu;
1341
1342         if (cork->length + length > maxnonfragsize - headersize) {
1343 emsgsize:
1344                 ipv6_local_error(sk, EMSGSIZE, fl6,
1345                                  mtu - headersize +
1346                                  sizeof(struct ipv6hdr));
1347                 return -EMSGSIZE;
1348         }
1349
1350         /* CHECKSUM_PARTIAL only with no extension headers and when
1351          * we are not going to fragment
1352          */
1353         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1354             headersize == sizeof(struct ipv6hdr) &&
1355             length <= mtu - headersize &&
1356             !(flags & MSG_MORE) &&
1357             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1358                 csummode = CHECKSUM_PARTIAL;
1359
1360         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1361                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1362                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1363                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1364                         tskey = sk->sk_tskey++;
1365         }
1366
1367         /*
1368          * Let's try using as much space as possible.
1369          * Use MTU if total length of the message fits into the MTU.
1370          * Otherwise, we need to reserve fragment header and
1371          * fragment alignment (= 8-15 octects, in total).
1372          *
1373          * Note that we may need to "move" the data from the tail of
1374          * of the buffer to the new fragment when we split
1375          * the message.
1376          *
1377          * FIXME: It may be fragmented into multiple chunks
1378          *        at once if non-fragmentable extension headers
1379          *        are too large.
1380          * --yoshfuji
1381          */
1382
1383         cork->length += length;
1384         if ((skb && skb_is_gso(skb)) ||
1385             (((length + (skb ? skb->len : headersize)) > mtu) &&
1386             (skb_queue_len(queue) <= 1) &&
1387             (sk->sk_protocol == IPPROTO_UDP) &&
1388             (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1389             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
1390                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1391                                           hh_len, fragheaderlen, exthdrlen,
1392                                           transhdrlen, mtu, flags, fl6);
1393                 if (err)
1394                         goto error;
1395                 return 0;
1396         }
1397
1398         if (!skb)
1399                 goto alloc_new_skb;
1400
1401         while (length > 0) {
1402                 /* Check if the remaining data fits into current packet. */
1403                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1404                 if (copy < length)
1405                         copy = maxfraglen - skb->len;
1406
1407                 if (copy <= 0) {
1408                         char *data;
1409                         unsigned int datalen;
1410                         unsigned int fraglen;
1411                         unsigned int fraggap;
1412                         unsigned int alloclen;
1413 alloc_new_skb:
1414                         /* There's no room in the current skb */
1415                         if (skb)
1416                                 fraggap = skb->len - maxfraglen;
1417                         else
1418                                 fraggap = 0;
1419                         /* update mtu and maxfraglen if necessary */
1420                         if (!skb || !skb_prev)
1421                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1422                                                     fragheaderlen, skb, rt,
1423                                                     orig_mtu);
1424
1425                         skb_prev = skb;
1426
1427                         /*
1428                          * If remaining data exceeds the mtu,
1429                          * we know we need more fragment(s).
1430                          */
1431                         datalen = length + fraggap;
1432
1433                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1434                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1435                         if ((flags & MSG_MORE) &&
1436                             !(rt->dst.dev->features&NETIF_F_SG))
1437                                 alloclen = mtu;
1438                         else
1439                                 alloclen = datalen + fragheaderlen;
1440
1441                         alloclen += dst_exthdrlen;
1442
1443                         if (datalen != length + fraggap) {
1444                                 /*
1445                                  * this is not the last fragment, the trailer
1446                                  * space is regarded as data space.
1447                                  */
1448                                 datalen += rt->dst.trailer_len;
1449                         }
1450
1451                         alloclen += rt->dst.trailer_len;
1452                         fraglen = datalen + fragheaderlen;
1453
1454                         /*
1455                          * We just reserve space for fragment header.
1456                          * Note: this may be overallocation if the message
1457                          * (without MSG_MORE) fits into the MTU.
1458                          */
1459                         alloclen += sizeof(struct frag_hdr);
1460
1461                         copy = datalen - transhdrlen - fraggap;
1462                         if (copy < 0) {
1463                                 err = -EINVAL;
1464                                 goto error;
1465                         }
1466                         if (transhdrlen) {
1467                                 skb = sock_alloc_send_skb(sk,
1468                                                 alloclen + hh_len,
1469                                                 (flags & MSG_DONTWAIT), &err);
1470                         } else {
1471                                 skb = NULL;
1472                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1473                                     2 * sk->sk_sndbuf)
1474                                         skb = sock_wmalloc(sk,
1475                                                            alloclen + hh_len, 1,
1476                                                            sk->sk_allocation);
1477                                 if (unlikely(!skb))
1478                                         err = -ENOBUFS;
1479                         }
1480                         if (!skb)
1481                                 goto error;
1482                         /*
1483                          *      Fill in the control structures
1484                          */
1485                         skb->protocol = htons(ETH_P_IPV6);
1486                         skb->ip_summed = csummode;
1487                         skb->csum = 0;
1488                         /* reserve for fragmentation and ipsec header */
1489                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1490                                     dst_exthdrlen);
1491
1492                         /* Only the initial fragment is time stamped */
1493                         skb_shinfo(skb)->tx_flags = tx_flags;
1494                         tx_flags = 0;
1495                         skb_shinfo(skb)->tskey = tskey;
1496                         tskey = 0;
1497
1498                         /*
1499                          *      Find where to start putting bytes
1500                          */
1501                         data = skb_put(skb, fraglen);
1502                         skb_set_network_header(skb, exthdrlen);
1503                         data += fragheaderlen;
1504                         skb->transport_header = (skb->network_header +
1505                                                  fragheaderlen);
1506                         if (fraggap) {
1507                                 skb->csum = skb_copy_and_csum_bits(
1508                                         skb_prev, maxfraglen,
1509                                         data + transhdrlen, fraggap, 0);
1510                                 skb_prev->csum = csum_sub(skb_prev->csum,
1511                                                           skb->csum);
1512                                 data += fraggap;
1513                                 pskb_trim_unique(skb_prev, maxfraglen);
1514                         }
1515                         if (copy > 0 &&
1516                             getfrag(from, data + transhdrlen, offset,
1517                                     copy, fraggap, skb) < 0) {
1518                                 err = -EFAULT;
1519                                 kfree_skb(skb);
1520                                 goto error;
1521                         }
1522
1523                         offset += copy;
1524                         length -= datalen - fraggap;
1525                         transhdrlen = 0;
1526                         exthdrlen = 0;
1527                         dst_exthdrlen = 0;
1528
1529                         if ((flags & MSG_CONFIRM) && !skb_prev)
1530                                 skb_set_dst_pending_confirm(skb, 1);
1531
1532                         /*
1533                          * Put the packet on the pending queue
1534                          */
1535                         __skb_queue_tail(queue, skb);
1536                         continue;
1537                 }
1538
1539                 if (copy > length)
1540                         copy = length;
1541
1542                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1543                         unsigned int off;
1544
1545                         off = skb->len;
1546                         if (getfrag(from, skb_put(skb, copy),
1547                                                 offset, copy, off, skb) < 0) {
1548                                 __skb_trim(skb, off);
1549                                 err = -EFAULT;
1550                                 goto error;
1551                         }
1552                 } else {
1553                         int i = skb_shinfo(skb)->nr_frags;
1554
1555                         err = -ENOMEM;
1556                         if (!sk_page_frag_refill(sk, pfrag))
1557                                 goto error;
1558
1559                         if (!skb_can_coalesce(skb, i, pfrag->page,
1560                                               pfrag->offset)) {
1561                                 err = -EMSGSIZE;
1562                                 if (i == MAX_SKB_FRAGS)
1563                                         goto error;
1564
1565                                 __skb_fill_page_desc(skb, i, pfrag->page,
1566                                                      pfrag->offset, 0);
1567                                 skb_shinfo(skb)->nr_frags = ++i;
1568                                 get_page(pfrag->page);
1569                         }
1570                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1571                         if (getfrag(from,
1572                                     page_address(pfrag->page) + pfrag->offset,
1573                                     offset, copy, skb->len, skb) < 0)
1574                                 goto error_efault;
1575
1576                         pfrag->offset += copy;
1577                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1578                         skb->len += copy;
1579                         skb->data_len += copy;
1580                         skb->truesize += copy;
1581                         refcount_add(copy, &sk->sk_wmem_alloc);
1582                 }
1583                 offset += copy;
1584                 length -= copy;
1585         }
1586
1587         return 0;
1588
1589 error_efault:
1590         err = -EFAULT;
1591 error:
1592         cork->length -= length;
1593         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1594         return err;
1595 }
1596
1597 int ip6_append_data(struct sock *sk,
1598                     int getfrag(void *from, char *to, int offset, int len,
1599                                 int odd, struct sk_buff *skb),
1600                     void *from, int length, int transhdrlen,
1601                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1602                     struct rt6_info *rt, unsigned int flags,
1603                     const struct sockcm_cookie *sockc)
1604 {
1605         struct inet_sock *inet = inet_sk(sk);
1606         struct ipv6_pinfo *np = inet6_sk(sk);
1607         int exthdrlen;
1608         int err;
1609
1610         if (flags&MSG_PROBE)
1611                 return 0;
1612         if (skb_queue_empty(&sk->sk_write_queue)) {
1613                 /*
1614                  * setup for corking
1615                  */
1616                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1617                                      ipc6, rt, fl6);
1618                 if (err)
1619                         return err;
1620
1621                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1622                 length += exthdrlen;
1623                 transhdrlen += exthdrlen;
1624         } else {
1625                 fl6 = &inet->cork.fl.u.ip6;
1626                 transhdrlen = 0;
1627         }
1628
1629         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1630                                  &np->cork, sk_page_frag(sk), getfrag,
1631                                  from, length, transhdrlen, flags, ipc6, sockc);
1632 }
1633 EXPORT_SYMBOL_GPL(ip6_append_data);
1634
1635 static void ip6_cork_release(struct inet_cork_full *cork,
1636                              struct inet6_cork *v6_cork)
1637 {
1638         if (v6_cork->opt) {
1639                 kfree(v6_cork->opt->dst0opt);
1640                 kfree(v6_cork->opt->dst1opt);
1641                 kfree(v6_cork->opt->hopopt);
1642                 kfree(v6_cork->opt->srcrt);
1643                 kfree(v6_cork->opt);
1644                 v6_cork->opt = NULL;
1645         }
1646
1647         if (cork->base.dst) {
1648                 dst_release(cork->base.dst);
1649                 cork->base.dst = NULL;
1650                 cork->base.flags &= ~IPCORK_ALLFRAG;
1651         }
1652         memset(&cork->fl, 0, sizeof(cork->fl));
1653 }
1654
1655 struct sk_buff *__ip6_make_skb(struct sock *sk,
1656                                struct sk_buff_head *queue,
1657                                struct inet_cork_full *cork,
1658                                struct inet6_cork *v6_cork)
1659 {
1660         struct sk_buff *skb, *tmp_skb;
1661         struct sk_buff **tail_skb;
1662         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1663         struct ipv6_pinfo *np = inet6_sk(sk);
1664         struct net *net = sock_net(sk);
1665         struct ipv6hdr *hdr;
1666         struct ipv6_txoptions *opt = v6_cork->opt;
1667         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1668         struct flowi6 *fl6 = &cork->fl.u.ip6;
1669         unsigned char proto = fl6->flowi6_proto;
1670
1671         skb = __skb_dequeue(queue);
1672         if (!skb)
1673                 goto out;
1674         tail_skb = &(skb_shinfo(skb)->frag_list);
1675
1676         /* move skb->data to ip header from ext header */
1677         if (skb->data < skb_network_header(skb))
1678                 __skb_pull(skb, skb_network_offset(skb));
1679         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1680                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1681                 *tail_skb = tmp_skb;
1682                 tail_skb = &(tmp_skb->next);
1683                 skb->len += tmp_skb->len;
1684                 skb->data_len += tmp_skb->len;
1685                 skb->truesize += tmp_skb->truesize;
1686                 tmp_skb->destructor = NULL;
1687                 tmp_skb->sk = NULL;
1688         }
1689
1690         /* Allow local fragmentation. */
1691         skb->ignore_df = ip6_sk_ignore_df(sk);
1692
1693         *final_dst = fl6->daddr;
1694         __skb_pull(skb, skb_network_header_len(skb));
1695         if (opt && opt->opt_flen)
1696                 ipv6_push_frag_opts(skb, opt, &proto);
1697         if (opt && opt->opt_nflen)
1698                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1699
1700         skb_push(skb, sizeof(struct ipv6hdr));
1701         skb_reset_network_header(skb);
1702         hdr = ipv6_hdr(skb);
1703
1704         ip6_flow_hdr(hdr, v6_cork->tclass,
1705                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1706                                         np->autoflowlabel, fl6));
1707         hdr->hop_limit = v6_cork->hop_limit;
1708         hdr->nexthdr = proto;
1709         hdr->saddr = fl6->saddr;
1710         hdr->daddr = *final_dst;
1711
1712         skb->priority = sk->sk_priority;
1713         skb->mark = sk->sk_mark;
1714
1715         skb_dst_set(skb, dst_clone(&rt->dst));
1716         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1717         if (proto == IPPROTO_ICMPV6) {
1718                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1719
1720                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1721                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1722         }
1723
1724         ip6_cork_release(cork, v6_cork);
1725 out:
1726         return skb;
1727 }
1728
1729 int ip6_send_skb(struct sk_buff *skb)
1730 {
1731         struct net *net = sock_net(skb->sk);
1732         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1733         int err;
1734
1735         err = ip6_local_out(net, skb->sk, skb);
1736         if (err) {
1737                 if (err > 0)
1738                         err = net_xmit_errno(err);
1739                 if (err)
1740                         IP6_INC_STATS(net, rt->rt6i_idev,
1741                                       IPSTATS_MIB_OUTDISCARDS);
1742         }
1743
1744         return err;
1745 }
1746
/*
 *	Build the pending datagram with ip6_finish_skb() and send it.
 *	Returns 0 when no skb was produced, else the ip6_send_skb() result.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
1757 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1758
1759 static void __ip6_flush_pending_frames(struct sock *sk,
1760                                        struct sk_buff_head *queue,
1761                                        struct inet_cork_full *cork,
1762                                        struct inet6_cork *v6_cork)
1763 {
1764         struct sk_buff *skb;
1765
1766         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1767                 if (skb_dst(skb))
1768                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1769                                       IPSTATS_MIB_OUTDISCARDS);
1770                 kfree_skb(skb);
1771         }
1772
1773         ip6_cork_release(cork, v6_cork);
1774 }
1775
1776 void ip6_flush_pending_frames(struct sock *sk)
1777 {
1778         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1779                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1780 }
1781 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1782
1783 struct sk_buff *ip6_make_skb(struct sock *sk,
1784                              int getfrag(void *from, char *to, int offset,
1785                                          int len, int odd, struct sk_buff *skb),
1786                              void *from, int length, int transhdrlen,
1787                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1788                              struct rt6_info *rt, unsigned int flags,
1789                              const struct sockcm_cookie *sockc)
1790 {
1791         struct inet_cork_full cork;
1792         struct inet6_cork v6_cork;
1793         struct sk_buff_head queue;
1794         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1795         int err;
1796
1797         if (flags & MSG_PROBE)
1798                 return NULL;
1799
1800         __skb_queue_head_init(&queue);
1801
1802         cork.base.flags = 0;
1803         cork.base.addr = 0;
1804         cork.base.opt = NULL;
1805         v6_cork.opt = NULL;
1806         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1807         if (err)
1808                 return ERR_PTR(err);
1809
1810         if (ipc6->dontfrag < 0)
1811                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1812
1813         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1814                                 &current->task_frag, getfrag, from,
1815                                 length + exthdrlen, transhdrlen + exthdrlen,
1816                                 flags, ipc6, sockc);
1817         if (err) {
1818                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1819                 return ERR_PTR(err);
1820         }
1821
1822         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1823 }