net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/bpf-cgroup.h>
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58 #include <linux/mroute6.h>
  59 #include <net/l3mdev.h>
  60 #include <net/lwtunnel.h>
  61
  62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  63 {
  64         struct dst_entry *dst = skb_dst(skb);
  65         struct net_device *dev = dst->dev;
  66         struct neighbour *neigh;
  67         struct in6_addr *nexthop;
  68         int ret;
  69
  70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  72
  73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  74                     ((mroute6_is_socket(net, skb) &&
  75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  77                                          &ipv6_hdr(skb)->saddr))) {
  78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  79
  80                         /* Do not check for IFF_ALLMULTI; multicast routing
  81                            is not supported in any case.
  82                          */
  83                         if (newskb)
  84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  85                                         net, sk, newskb, NULL, newskb->dev,
  86                                         dev_loopback_xmit);
  87
  88                         if (ipv6_hdr(skb)->hop_limit == 0) {
  89                                 IP6_INC_STATS(net, idev,
  90                                               IPSTATS_MIB_OUTDISCARDS);
  91                                 kfree_skb(skb);
  92                                 return 0;
  93                         }
  94                 }
  95
  96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  97
  98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  99                     IPV6_ADDR_SCOPE_NODELOCAL &&
 100                     !(dev->flags & IFF_LOOPBACK)) {
 101                         kfree_skb(skb);
 102                         return 0;
 103                 }
 104         }
 105
 106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 107                 int res = lwtunnel_xmit(skb);
 108
 109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 110                         return res;
 111         }
 112
 113         rcu_read_lock_bh();
 114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 116         if (unlikely(!neigh))
 117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 118         if (!IS_ERR(neigh)) {
 119                 sock_confirm_neigh(skb, neigh);
 120                 ret = neigh_output(neigh, skb);
 121                 rcu_read_unlock_bh();
 122                 return ret;
 123         }
 124         rcu_read_unlock_bh();
 125
 126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 127         kfree_skb(skb);
 128         return -EINVAL;
 129 }
 130
 131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 132 {
 133         int ret;
 134
 135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 136         if (ret) {
 137                 kfree_skb(skb);
 138                 return ret;
 139         }
 140
 141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 142         /* Policy lookup after SNAT yielded a new policy */
 143         if (skb_dst(skb)->xfrm) {
 144                 IPCB(skb)->flags |= IPSKB_REROUTED;
 145                 return dst_output(net, sk, skb);
 146         }
 147 #endif
 148
 149         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 150             dst_allfrag(skb_dst(skb)) ||
 151             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 152                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 153         else
 154                 return ip6_finish_output2(net, sk, skb);
 155 }
 156
 157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 158 {
 159         struct net_device *dev = skb_dst(skb)->dev;
 160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 161
 162         skb->protocol = htons(ETH_P_IPV6);
 163         skb->dev = dev;
 164
 165         if (unlikely(idev->cnf.disable_ipv6)) {
 166                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 167                 kfree_skb(skb);
 168                 return 0;
 169         }
 170
 171         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 172                             net, sk, skb, NULL, dev,
 173                             ip6_finish_output,
 174                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 175 }
 176
 177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 178 {
 179         if (!np->autoflowlabel_set)
 180                 return ip6_default_np_autolabel(net);
 181         else
 182                 return np->autoflowlabel;
 183 }
 184
 185 /*
 186  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 187  * Note : socket lock is not held for SYNACK packets, but might be modified
 188  * by calls to skb_set_owner_w() and ipv6_local_error(),
 189  * which are using proper atomic operations or spinlocks.
 190  */
 191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 192              __u32 mark, struct ipv6_txoptions *opt, int tclass)
 193 {
 194         struct net *net = sock_net(sk);
 195         const struct ipv6_pinfo *np = inet6_sk(sk);
 196         struct in6_addr *first_hop = &fl6->daddr;
 197         struct dst_entry *dst = skb_dst(skb);
 198         struct ipv6hdr *hdr;
 199         u8  proto = fl6->flowi6_proto;
 200         int seg_len = skb->len;
 201         int hlimit = -1;
 202         u32 mtu;
 203
 204         if (opt) {
 205                 unsigned int head_room;
 206
 207                 /* First: exthdrs may take lots of space (~8K for now)
 208                    MAX_HEADER is not enough.
 209                  */
 210                 head_room = opt->opt_nflen + opt->opt_flen;
 211                 seg_len += head_room;
 212                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 213
 214                 if (skb_headroom(skb) < head_room) {
 215                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 216                         if (!skb2) {
 217                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 218                                               IPSTATS_MIB_OUTDISCARDS);
 219                                 kfree_skb(skb);
 220                                 return -ENOBUFS;
 221                         }
 222                         consume_skb(skb);
 223                         skb = skb2;
 224                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 225                          * it is safe to call in our context (socket lock not held)
 226                          */
 227                         skb_set_owner_w(skb, (struct sock *)sk);
 228                 }
 229                 if (opt->opt_flen)
 230                         ipv6_push_frag_opts(skb, opt, &proto);
 231                 if (opt->opt_nflen)
 232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 233                                              &fl6->saddr);
 234         }
 235
 236         skb_push(skb, sizeof(struct ipv6hdr));
 237         skb_reset_network_header(skb);
 238         hdr = ipv6_hdr(skb);
 239
 240         /*
 241          *      Fill in the IPv6 header
 242          */
 243         if (np)
 244                 hlimit = np->hop_limit;
 245         if (hlimit < 0)
 246                 hlimit = ip6_dst_hoplimit(dst);
 247
 248         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 249                                 ip6_autoflowlabel(net, np), fl6));
 250
 251         hdr->payload_len = htons(seg_len);
 252         hdr->nexthdr = proto;
 253         hdr->hop_limit = hlimit;
 254
 255         hdr->saddr = fl6->saddr;
 256         hdr->daddr = *first_hop;
 257
 258         skb->protocol = htons(ETH_P_IPV6);
 259         skb->priority = sk->sk_priority;
 260         skb->mark = mark;
 261
 262         mtu = dst_mtu(dst);
 263         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 264                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 265                               IPSTATS_MIB_OUT, skb->len);
 266
 267                 /* if egress device is enslaved to an L3 master device pass the
 268                  * skb to its handler for processing
 269                  */
 270                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 271                 if (unlikely(!skb))
 272                         return 0;
 273
 274                 /* hooks should never assume socket lock is held.
 275                  * we promote our socket to non const
 276                  */
 277                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 278                                net, (struct sock *)sk, skb, NULL, dst->dev,
 279                                dst_output);
 280         }
 281
 282         skb->dev = dst->dev;
 283         /* ipv6_local_error() does not require socket lock,
 284          * we promote our socket to non const
 285          */
 286         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 287
 288         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 289         kfree_skb(skb);
 290         return -EMSGSIZE;
 291 }
 292 EXPORT_SYMBOL(ip6_xmit);
 293
 294 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 295 {
 296         struct ip6_ra_chain *ra;
 297         struct sock *last = NULL;
 298
 299         read_lock(&ip6_ra_lock);
 300         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 301                 struct sock *sk = ra->sk;
 302                 if (sk && ra->sel == sel &&
 303                     (!sk->sk_bound_dev_if ||
 304                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 305                         if (last) {
 306                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 307                                 if (skb2)
 308                                         rawv6_rcv(last, skb2);
 309                         }
 310                         last = sk;
 311                 }
 312         }
 313
 314         if (last) {
 315                 rawv6_rcv(last, skb);
 316                 read_unlock(&ip6_ra_lock);
 317                 return 1;
 318         }
 319         read_unlock(&ip6_ra_lock);
 320         return 0;
 321 }
 322
 323 static int ip6_forward_proxy_check(struct sk_buff *skb)
 324 {
 325         struct ipv6hdr *hdr = ipv6_hdr(skb);
 326         u8 nexthdr = hdr->nexthdr;
 327         __be16 frag_off;
 328         int offset;
 329
 330         if (ipv6_ext_hdr(nexthdr)) {
 331                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 332                 if (offset < 0)
 333                         return 0;
 334         } else
 335                 offset = sizeof(struct ipv6hdr);
 336
 337         if (nexthdr == IPPROTO_ICMPV6) {
 338                 struct icmp6hdr *icmp6;
 339
 340                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 341                                          offset + 1 - skb->data)))
 342                         return 0;
 343
 344                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 345
 346                 switch (icmp6->icmp6_type) {
 347                 case NDISC_ROUTER_SOLICITATION:
 348                 case NDISC_ROUTER_ADVERTISEMENT:
 349                 case NDISC_NEIGHBOUR_SOLICITATION:
 350                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 351                 case NDISC_REDIRECT:
 352                         /* For reaction involving unicast neighbor discovery
 353                          * message destined to the proxied address, pass it to
 354                          * input function.
 355                          */
 356                         return 1;
 357                 default:
 358                         break;
 359                 }
 360         }
 361
 362         /*
 363          * The proxying router can't forward traffic sent to a link-local
 364          * address, so signal the sender and discard the packet. This
 365          * behavior is clarified by the MIPv6 specification.
 366          */
 367         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 368                 dst_link_failure(skb);
 369                 return -1;
 370         }
 371
 372         return 0;
 373 }
 374
 375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 376                                      struct sk_buff *skb)
 377 {
 378         struct dst_entry *dst = skb_dst(skb);
 379
 380         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 381         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 382
 383         return dst_output(net, sk, skb);
 384 }
 385
 386 unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 387 {
 388         unsigned int mtu;
 389         struct inet6_dev *idev;
 390
 391         if (dst_metric_locked(dst, RTAX_MTU)) {
 392                 mtu = dst_metric_raw(dst, RTAX_MTU);
 393                 if (mtu)
 394                         return mtu;
 395         }
 396
 397         mtu = IPV6_MIN_MTU;
 398         rcu_read_lock();
 399         idev = __in6_dev_get(dst->dev);
 400         if (idev)
 401                 mtu = idev->cnf.mtu6;
 402         rcu_read_unlock();
 403
 404         return mtu;
 405 }
 406 EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
 407
 408 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 409 {
 410         if (skb->len <= mtu)
 411                 return false;
 412
 413         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 414         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 415                 return true;
 416
 417         if (skb->ignore_df)
 418                 return false;
 419
 420         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 421                 return false;
 422
 423         return true;
 424 }
 425
 426 int ip6_forward(struct sk_buff *skb)
 427 {
 428         struct dst_entry *dst = skb_dst(skb);
 429         struct ipv6hdr *hdr = ipv6_hdr(skb);
 430         struct inet6_skb_parm *opt = IP6CB(skb);
 431         struct net *net = dev_net(dst->dev);
 432         u32 mtu;
 433
 434         if (net->ipv6.devconf_all->forwarding == 0)
 435                 goto error;
 436
 437         if (skb->pkt_type != PACKET_HOST)
 438                 goto drop;
 439
 440         if (unlikely(skb->sk))
 441                 goto drop;
 442
 443         if (skb_warn_if_lro(skb))
 444                 goto drop;
 445
 446         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 447                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 448                                 IPSTATS_MIB_INDISCARDS);
 449                 goto drop;
 450         }
 451
 452         skb_forward_csum(skb);
 453
 454         /*
 455          *      We DO NOT make any processing on
 456          *      RA packets, pushing them to user level AS IS
 457          *      without ane WARRANTY that application will be able
 458          *      to interpret them. The reason is that we
 459          *      cannot make anything clever here.
 460          *
 461          *      We are not end-node, so that if packet contains
 462          *      AH/ESP, we cannot make anything.
 463          *      Defragmentation also would be mistake, RA packets
 464          *      cannot be fragmented, because there is no warranty
 465          *      that different fragments will go along one path. --ANK
 466          */
 467         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 468                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 469                         return 0;
 470         }
 471
 472         /*
 473          *      check and decrement ttl
 474          */
 475         if (hdr->hop_limit <= 1) {
 476                 /* Force OUTPUT device used as source address */
 477                 skb->dev = dst->dev;
 478                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 479                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 480                                 IPSTATS_MIB_INHDRERRORS);
 481
 482                 kfree_skb(skb);
 483                 return -ETIMEDOUT;
 484         }
 485
 486         /* XXX: idev->cnf.proxy_ndp? */
 487         if (net->ipv6.devconf_all->proxy_ndp &&
 488             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 489                 int proxied = ip6_forward_proxy_check(skb);
 490                 if (proxied > 0)
 491                         return ip6_input(skb);
 492                 else if (proxied < 0) {
 493                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
 494                                         IPSTATS_MIB_INDISCARDS);
 495                         goto drop;
 496                 }
 497         }
 498
 499         if (!xfrm6_route_forward(skb)) {
 500                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 501                                 IPSTATS_MIB_INDISCARDS);
 502                 goto drop;
 503         }
 504         dst = skb_dst(skb);
 505
 506         /* IPv6 specs say nothing about it, but it is clear that we cannot
 507            send redirects to source routed frames.
 508            We don't send redirects to frames decapsulated from IPsec.
 509          */
 510         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 511                 struct in6_addr *target = NULL;
 512                 struct inet_peer *peer;
 513                 struct rt6_info *rt;
 514
 515                 /*
 516                  *      incoming and outgoing devices are the same
 517                  *      send a redirect.
 518                  */
 519
 520                 rt = (struct rt6_info *) dst;
 521                 if (rt->rt6i_flags & RTF_GATEWAY)
 522                         target = &rt->rt6i_gateway;
 523                 else
 524                         target = &hdr->daddr;
 525
 526                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 527
 528                 /* Limit redirects both by destination (here)
 529                    and by source (inside ndisc_send_redirect)
 530                  */
 531                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 532                         ndisc_send_redirect(skb, target);
 533                 if (peer)
 534                         inet_putpeer(peer);
 535         } else {
 536                 int addrtype = ipv6_addr_type(&hdr->saddr);
 537
 538                 /* This check is security critical. */
 539                 if (addrtype == IPV6_ADDR_ANY ||
 540                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 541                         goto error;
 542                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 543                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 544                                     ICMPV6_NOT_NEIGHBOUR, 0);
 545                         goto error;
 546                 }
 547         }
 548
 549         mtu = ip6_dst_mtu_forward(dst);
 550         if (mtu < IPV6_MIN_MTU)
 551                 mtu = IPV6_MIN_MTU;
 552
 553         if (ip6_pkt_too_big(skb, mtu)) {
 554                 /* Again, force OUTPUT device used as source address */
 555                 skb->dev = dst->dev;
 556                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 557                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 558                                 IPSTATS_MIB_INTOOBIGERRORS);
 559                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 560                                 IPSTATS_MIB_FRAGFAILS);
 561                 kfree_skb(skb);
 562                 return -EMSGSIZE;
 563         }
 564
 565         if (skb_cow(skb, dst->dev->hard_header_len)) {
 566                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 567                                 IPSTATS_MIB_OUTDISCARDS);
 568                 goto drop;
 569         }
 570
 571         hdr = ipv6_hdr(skb);
 572
 573         /* Mangling hops number delayed to point after skb COW */
 574
 575         hdr->hop_limit--;
 576
 577         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 578                        net, NULL, skb, skb->dev, dst->dev,
 579                        ip6_forward_finish);
 580
 581 error:
 582         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 583 drop:
 584         kfree_skb(skb);
 585         return -EINVAL;
 586 }
 587
 588 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 589 {
 590         to->pkt_type = from->pkt_type;
 591         to->priority = from->priority;
 592         to->protocol = from->protocol;
 593         skb_dst_drop(to);
 594         skb_dst_set(to, dst_clone(skb_dst(from)));
 595         to->dev = from->dev;
 596         to->mark = from->mark;
 597
 598 #ifdef CONFIG_NET_SCHED
 599         to->tc_index = from->tc_index;
 600 #endif
 601         nf_copy(to, from);
 602         skb_copy_secmark(to, from);
 603 }
 604
 605 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 606                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 607 {
 608         struct sk_buff *frag;
 609         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 610         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 611                                 inet6_sk(skb->sk) : NULL;
 612         struct ipv6hdr *tmp_hdr;
 613         struct frag_hdr *fh;
 614         unsigned int mtu, hlen, left, len;
 615         int hroom, troom;
 616         __be32 frag_id;
 617         int ptr, offset = 0, err = 0;
 618         u8 *prevhdr, nexthdr = 0;
 619
 620         err = ip6_find_1stfragopt(skb, &prevhdr);
 621         if (err < 0)
 622                 goto fail;
 623         hlen = err;
 624         nexthdr = *prevhdr;
 625
 626         mtu = ip6_skb_dst_mtu(skb);
 627
 628         /* We must not fragment if the socket is set to force MTU discovery
 629          * or if the skb it not generated by a local socket.
 630          */
 631         if (unlikely(!skb->ignore_df && skb->len > mtu))
 632                 goto fail_toobig;
 633
 634         if (IP6CB(skb)->frag_max_size) {
 635                 if (IP6CB(skb)->frag_max_size > mtu)
 636                         goto fail_toobig;
 637
 638                 /* don't send fragments larger than what we received */
 639                 mtu = IP6CB(skb)->frag_max_size;
 640                 if (mtu < IPV6_MIN_MTU)
 641                         mtu = IPV6_MIN_MTU;
 642         }
 643
 644         if (np && np->frag_size < mtu) {
 645                 if (np->frag_size)
 646                         mtu = np->frag_size;
 647         }
 648         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 649                 goto fail_toobig;
 650         mtu -= hlen + sizeof(struct frag_hdr);
 651
 652         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 653                                     &ipv6_hdr(skb)->saddr);
 654
 655         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 656             (err = skb_checksum_help(skb)))
 657                 goto fail;
 658
 659         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 660         if (skb_has_frag_list(skb)) {
 661                 unsigned int first_len = skb_pagelen(skb);
 662                 struct sk_buff *frag2;
 663
 664                 if (first_len - hlen > mtu ||
 665                     ((first_len - hlen) & 7) ||
 666                     skb_cloned(skb) ||
 667                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 668                         goto slow_path;
 669
 670                 skb_walk_frags(skb, frag) {
 671                         /* Correct geometry. */
 672                         if (frag->len > mtu ||
 673                             ((frag->len & 7) && frag->next) ||
 674                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 675                                 goto slow_path_clean;
 676
 677                         /* Partially cloned skb? */
 678                         if (skb_shared(frag))
 679                                 goto slow_path_clean;
 680
 681                         BUG_ON(frag->sk);
 682                         if (skb->sk) {
 683                                 frag->sk = skb->sk;
 684                                 frag->destructor = sock_wfree;
 685                         }
 686                         skb->truesize -= frag->truesize;
 687                 }
 688
 689                 err = 0;
 690                 offset = 0;
 691                 /* BUILD HEADER */
 692
 693                 *prevhdr = NEXTHDR_FRAGMENT;
 694                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 695                 if (!tmp_hdr) {
 696                         err = -ENOMEM;
 697                         goto fail;
 698                 }
 699                 frag = skb_shinfo(skb)->frag_list;
 700                 skb_frag_list_init(skb);
 701
 702                 __skb_pull(skb, hlen);
 703                 fh = __skb_push(skb, sizeof(struct frag_hdr));
 704                 __skb_push(skb, hlen);
 705                 skb_reset_network_header(skb);
 706                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 707
 708                 fh->nexthdr = nexthdr;
 709                 fh->reserved = 0;
 710                 fh->frag_off = htons(IP6_MF);
 711                 fh->identification = frag_id;
 712
 713                 first_len = skb_pagelen(skb);
 714                 skb->data_len = first_len - skb_headlen(skb);
 715                 skb->len = first_len;
 716                 ipv6_hdr(skb)->payload_len = htons(first_len -
 717                                                    sizeof(struct ipv6hdr));
 718
 719                 for (;;) {
 720                         /* Prepare header of the next frame,
 721                          * before previous one went down. */
 722                         if (frag) {
 723                                 frag->ip_summed = CHECKSUM_NONE;
 724                                 skb_reset_transport_header(frag);
 725                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
 726                                 __skb_push(frag, hlen);
 727                                 skb_reset_network_header(frag);
 728                                 memcpy(skb_network_header(frag), tmp_hdr,
 729                                        hlen);
 730                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 731                                 fh->nexthdr = nexthdr;
 732                                 fh->reserved = 0;
 733                                 fh->frag_off = htons(offset);
 734                                 if (frag->next)
 735                                         fh->frag_off |= htons(IP6_MF);
 736                                 fh->identification = frag_id;
 737                                 ipv6_hdr(frag)->payload_len =
 738                                                 htons(frag->len -
 739                                                       sizeof(struct ipv6hdr));
 740                                 ip6_copy_metadata(frag, skb);
 741                         }
 742
 743                         err = output(net, sk, skb);
 744                         if (!err)
 745                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 746                                               IPSTATS_MIB_FRAGCREATES);
 747
 748                         if (err || !frag)
 749                                 break;
 750
 751                         skb = frag;
 752                         frag = skb->next;
 753                         skb->next = NULL;
 754                 }
 755
 756                 kfree(tmp_hdr);
 757
 758                 if (err == 0) {
 759                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 760                                       IPSTATS_MIB_FRAGOKS);
 761                         return 0;
 762                 }
 763
 764                 kfree_skb_list(frag);
 765
 766                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 767                               IPSTATS_MIB_FRAGFAILS);
 768                 return err;
 769
 770 slow_path_clean:
 771                 skb_walk_frags(skb, frag2) {
 772                         if (frag2 == frag)
 773                                 break;
 774                         frag2->sk = NULL;
 775                         frag2->destructor = NULL;
 776                         skb->truesize += frag2->truesize;
 777                 }
 778         }
 779
 780 slow_path:
 781         left = skb->len - hlen;         /* Space per frame */
 782         ptr = hlen;                     /* Where to start from */
 783
 784         /*
 785          *      Fragment the datagram.
 786          */
 787
 788         troom = rt->dst.dev->needed_tailroom;
 789
 790         /*
 791          *      Keep copying data until we run out.
 792          */
 793         while (left > 0)        {
 794                 u8 *fragnexthdr_offset;
 795
 796                 len = left;
 797                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 798                 if (len > mtu)
 799                         len = mtu;
 800                 /* IF: we are not sending up to and including the packet end
 801                    then align the next start on an eight byte boundary */
 802                 if (len < left) {
 803                         len &= ~7;
 804                 }
 805
 806                 /* Allocate buffer */
 807                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 808                                  hroom + troom, GFP_ATOMIC);
 809                 if (!frag) {
 810                         err = -ENOMEM;
 811                         goto fail;
 812                 }
 813
 814                 /*
 815                  *      Set up data on packet
 816                  */
 817
 818                 ip6_copy_metadata(frag, skb);
 819                 skb_reserve(frag, hroom);
 820                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 821                 skb_reset_network_header(frag);
 822                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 823                 frag->transport_header = (frag->network_header + hlen +
 824                                           sizeof(struct frag_hdr));
 825
 826                 /*
 827                  *      Charge the memory for the fragment to any owner
 828                  *      it might possess
 829                  */
 830                 if (skb->sk)
 831                         skb_set_owner_w(frag, skb->sk);
 832
 833                 /*
 834                  *      Copy the packet header into the new buffer.
 835                  */
 836                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 837
 838                 fragnexthdr_offset = skb_network_header(frag);
 839                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
 840                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 841
 842                 /*
 843                  *      Build fragment header.
 844                  */
 845                 fh->nexthdr = nexthdr;
 846                 fh->reserved = 0;
 847                 fh->identification = frag_id;
 848
 849                 /*
 850                  *      Copy a block of the IP datagram.
 851                  */
 852                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 853                                      len));
 854                 left -= len;
 855
 856                 fh->frag_off = htons(offset);
 857                 if (left > 0)
 858                         fh->frag_off |= htons(IP6_MF);
 859                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 860                                                     sizeof(struct ipv6hdr));
 861
 862                 ptr += len;
 863                 offset += len;
 864
 865                 /*
 866                  *      Put this fragment into the sending queue.
 867                  */
 868                 err = output(net, sk, frag);
 869                 if (err)
 870                         goto fail;
 871
 872                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 873                               IPSTATS_MIB_FRAGCREATES);
 874         }
 875         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 876                       IPSTATS_MIB_FRAGOKS);
 877         consume_skb(skb);
 878         return err;
 879
 880 fail_toobig:
 881         if (skb->sk && dst_allfrag(skb_dst(skb)))
 882                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 883
 884         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 885         err = -EMSGSIZE;
 886
 887 fail:
 888         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 889                       IPSTATS_MIB_FRAGFAILS);
 890         kfree_skb(skb);
 891         return err;
 892 }
 893
 894 static inline int ip6_rt_check(const struct rt6key *rt_key,
 895                                const struct in6_addr *fl_addr,
 896                                const struct in6_addr *addr_cache)
 897 {
             /* Test whether a cached route key fails to cover @fl_addr.
              *
              * Evaluates to 0 ("still usable") when either
              *   - @rt_key has plen 128 and its address equals @fl_addr, or
              *   - @addr_cache is non-NULL and equals @fl_addr;
              * otherwise evaluates to 1.  With a NULL @addr_cache only the
              * plen-128 test can succeed.
              */
 898         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 899                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 900 }
 901
 902 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 903                                           struct dst_entry *dst,
 904                                           const struct flowi6 *fl6)
 905 {
             /* Decide whether @dst (may be NULL) can be used for the flow @fl6.
              *
              * Returns @dst unchanged when it passes every check below.  In all
              * other cases the reference on @dst is dropped with dst_release()
              * and NULL is returned; a NULL @dst is returned as NULL.
              */
 906         struct ipv6_pinfo *np = inet6_sk(sk);
 907         struct rt6_info *rt;
 908
 909         if (!dst)
 910                 goto out;
 911
             /* The cast to rt6_info below is only done for AF_INET6 entries. */
 912         if (dst->ops->family != AF_INET6) {
 913                 dst_release(dst);
 914                 return NULL;
 915         }
 916
 917         rt = (struct rt6_info *)dst;
 918         /* Yes, checking route validity in not connected
 919          * case is not very simple. Take into account,
 920          * that we do not support routing by source, TOS,
 921          * and MSG_DONTROUTE            --ANK (980726)
 922          *
 923          * 1. ip6_rt_check(): If route was host route,
 924          *    check that cached destination is current.
 925          *    If it is network route, we still may
 926          *    check its validity using saved pointer
 927          *    to the last used address: daddr_cache.
 928          *    We do not want to save whole address now,
 929          *    (because main consumer of this service
 930          *    is tcp, which has not this problem),
 931          *    so that the last trick works only on connected
 932          *    sockets.
 933          * 2. oif also should be the same.
 934          */
             /* The route is dropped when the destination key check fails, when
              * (CONFIG_IPV6_SUBTREES only) the source key check fails, or when
              * the flow names an output interface different from dst->dev and
              * FLOWI_FLAG_SKIP_NH_OIF is not set in the flow flags.
              */
 935         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 936 #ifdef CONFIG_IPV6_SUBTREES
 937             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 938 #endif
 939            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 940               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 941                 dst_release(dst);
 942                 dst = NULL;
 943         }
 944
 945 out:
 946         return dst;
 947 }
 948
 949 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 950                                struct dst_entry **dst, struct flowi6 *fl6)
 951 {
             /* Shared worker behind ip6_dst_lookup() and ip6_dst_lookup_flow().
              *
              * *@dst is either NULL or a route supplied by the caller; when it
              * is NULL a route is looked up for @fl6.  If @fl6->saddr is the
              * unspecified address it is filled in via ip6_route_get_saddr().
              *
              * Returns 0 with *@dst set on success.  On failure the route is
              * released, *@dst is set to NULL and the error is returned; for
              * -ENETUNREACH IPSTATS_MIB_OUTNOROUTES is incremented as well.
              *
              * A NULL @sk is tolerated by the srcprefs selection below; whether
              * the route lookup helpers accept a NULL @sk is not visible here.
              */
 952 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 953         struct neighbour *n;
 954         struct rt6_info *rt;
 955 #endif
 956         int err;
 957         int flags = 0;
 958
 959         /* The correct way to handle this would be to do
 960          * ip6_route_get_saddr, and then ip6_route_output; however,
 961          * the route-specific preferred source forces the
 962          * ip6_route_output call _before_ ip6_route_get_saddr.
 963          *
 964          * In source specific routing (no src=any default route),
 965          * ip6_route_output will fail given src=any saddr, though, so
 966          * that's why we try it again later.
 967          */
 968         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                     /* Block-local rt; with CONFIG_IPV6_OPTIMISTIC_DAD it shadows
                      * the function-level rt declared above.
                      */
 969                 struct rt6_info *rt;
 970                 bool had_dst = *dst != NULL;
 971
 972                 if (!had_dst)
 973                         *dst = ip6_route_output(net, sk, fl6);
                     /* An error route is not offered to the source address
                      * selection; NULL is passed instead.
                      */
 974                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 975                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 976                                           sk ? inet6_sk(sk)->srcprefs : 0,
 977                                           &fl6->saddr);
 978                 if (err)
 979                         goto out_err_release;
 980
 981                 /* If we had an erroneous initial result, pretend it
 982                  * never existed and let the SA-enabled version take
 983                  * over.
 984                  */
 985                 if (!had_dst && (*dst)->error) {
 986                         dst_release(*dst);
 987                         *dst = NULL;
 988                 }
 989
 990                 if (fl6->flowi6_oif)
 991                         flags |= RT6_LOOKUP_F_IFACE;
 992         }
 993
             /* Reached with *dst == NULL either because the caller passed none
              * and saddr was already set, or because the first attempt above
              * produced an error route that was discarded.
              */
 994         if (!*dst)
 995                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
 996
 997         err = (*dst)->error;
 998         if (err)
 999                 goto out_err_release;
1000
1001 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1002         /*
1003          * Here if the dst entry we've looked up
1004          * has a neighbour entry that is in the INCOMPLETE
1005          * state and the src address from the flow is
1006          * marked as OPTIMISTIC, we release the found
1007          * dst entry and replace it instead with the
1008          * dst entry of the nexthop router
1009          */
             /* Note: the test below is !(nud_state & NUD_VALID), i.e. it fires
              * for any neighbour state outside NUD_VALID.  err is used only as
              * a local flag here: when no redirect takes place it keeps the
              * value -EINVAL, yet the function still reaches "return 0".
              */
1010         rt = (struct rt6_info *) *dst;
1011         rcu_read_lock_bh();
1012         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1013                                       rt6_nexthop(rt, &fl6->daddr));
1014         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1015         rcu_read_unlock_bh();
1016
1017         if (err) {
1018                 struct inet6_ifaddr *ifp;
1019                 struct flowi6 fl_gw6;
1020                 int redirect;
1021
1022                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1023                                       (*dst)->dev, 1);
1024
                     /* Read the flag before dropping the ifaddr reference. */
1025                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1026                 if (ifp)
1027                         in6_ifa_put(ifp);
1028
1029                 if (redirect) {
1030                         /*
1031                          * We need to get the dst entry for the
1032                          * default router instead
1033                          */
                             /* fl_gw6 is a copy of the flow with daddr zeroed;
                              * the caller's @fl6 itself is not modified here.
                              */
1034                         dst_release(*dst);
1035                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1036                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1037                         *dst = ip6_route_output(net, sk, &fl_gw6);
1038                         err = (*dst)->error;
1039                         if (err)
1040                                 goto out_err_release;
1041                 }
1042         }
1043 #endif
             /* A v4-mapped source is only accepted together with a destination
              * that is v4-mapped or unspecified.
              */
1044         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1045             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1046                 err = -EAFNOSUPPORT;
1047                 goto out_err_release;
1048         }
1049
1050         return 0;
1051
1052 out_err_release:
1053         dst_release(*dst);
1054         *dst = NULL;
1055
1056         if (err == -ENETUNREACH)
1057                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1058         return err;
1059 }
1060
1061 /**
1062  *      ip6_dst_lookup - perform route lookup on flow
      *      @net: struct net handed unchanged to the route lookup
1063  *      @sk: socket which provides route info
1064  *      @dst: pointer to dst_entry * for result
1065  *      @fl6: flow to lookup
1066  *
1067  *      This function performs a route lookup on the given flow.
      *      Whatever *@dst holds on entry is overwritten with NULL before
      *      the lookup, so a route is always looked up afresh.
1068  *
1069  *      It returns zero on success, or a standard errno code on error.
      *      On error *@dst is left NULL.
1070  */
1071 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1072                    struct flowi6 *fl6)
1073 {
1074         *dst = NULL;
1075         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1076 }
1077 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1078
1079 /**
1080  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1081  *      @sk: socket which provides route info
1082  *      @fl6: flow to lookup
1083  *      @final_dst: final destination address for ipsec lookup
1084  *
1085  *      This function performs a route lookup on the given flow.
      *      The route lookup uses @fl6 as passed in.  If @final_dst is not
      *      NULL, @fl6->daddr is then overwritten with it before the result
      *      is handed to xfrm_lookup_route(); the caller sees that change.
      *      @sk is passed to sock_net() unconditionally.
1086  *
1087  *      It returns a valid dst pointer on success, or a pointer encoded
1088  *      error code.
1089  */
1090 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1091                                       const struct in6_addr *final_dst)
1092 {
1093         struct dst_entry *dst = NULL;
1094         int err;
1095
1096         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1097         if (err)
1098                 return ERR_PTR(err);
1099         if (final_dst)
1100                 fl6->daddr = *final_dst;
1101
1102         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1103 }
1104 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1105
1106 /**
1107  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1108  *      @sk: socket which provides the dst cache and route info
1109  *      @fl6: flow to lookup
1110  *      @final_dst: final destination address for ipsec lookup
1111  *      @connected: whether @sk is connected or not
1112  *
1113  *      This function performs a route lookup on the given flow with the
1114  *      possibility of using the cached route in the socket if it is valid.
1115  *      It will take the socket dst lock when operating on the dst cache.
1116  *      As a result, this function can only be used in process context.
1117  *
1118  *      In addition, for a connected socket, cache the dst in the socket
1119  *      if the current cache is not valid.
1120  *
1121  *      It returns a valid dst pointer on success, or a pointer encoded
1122  *      error code.
1123  */
1124 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1125                                          const struct in6_addr *final_dst,
1126                                          bool connected)
1127 {
1128         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1129
             /* A cached route that passes ip6_sk_dst_check() is returned as is;
              * @final_dst is not consulted on this path.
              */
1130         dst = ip6_sk_dst_check(sk, dst, fl6);
1131         if (dst)
1132                 return dst;
1133
             /* No usable cached route: do a full lookup.  For a connected
              * socket a successful result is also stored in the socket, using
              * an extra reference from dst_clone() so that the reference
              * returned to the caller stays separate.
              */
1134         dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1135         if (connected && !IS_ERR(dst))
1136                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1137
1138         return dst;
1139 }
1140 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1141
1142 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1143                                                gfp_t gfp)
1144 {
             /* Duplicate an option header with kmemdup(), copying
              * (src->hdrlen + 1) * 8 bytes.  A NULL @src yields NULL; the
              * caller in this file also treats a NULL result for a non-NULL
              * @src as an allocation failure.
              */
1145         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1146 }
1147
1148 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1149                                                 gfp_t gfp)
1150 {
             /* Routing-header counterpart of ip6_opt_dup(): copies
              * (src->hdrlen + 1) * 8 bytes with kmemdup(), or returns NULL
              * when @src is NULL.
              */
1151         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1152 }
1153
1154 static void ip6_append_data_mtu(unsigned int *mtu,
1155                                 int *maxfraglen,
1156                                 unsigned int fragheaderlen,
1157                                 struct sk_buff *skb,
1158                                 struct rt6_info *rt,
1159                                 unsigned int orig_mtu)
1160 {
             /* Recompute *@mtu and *@maxfraglen for the next skb to allocate.
              *
              * Nothing is changed when the route has DST_XFRM_TUNNEL set.
              * Otherwise *@mtu becomes @orig_mtu minus rt->dst.header_len when
              * @skb is NULL, or plain @orig_mtu when @skb is non-NULL, and
              * *@maxfraglen is derived from the new *@mtu: the space after
              * @fragheaderlen is rounded down to a multiple of 8, the header
              * length is added back and sizeof(struct frag_hdr) is subtracted.
              *
              * NOTE(review): @maxfraglen is declared int * while the caller in
              * this file passes the address of an unsigned int.
              */
1161         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1162                 if (!skb) {
1163                         /* first fragment, reserve header_len */
1164                         *mtu = orig_mtu - rt->dst.header_len;
1165
1166                 } else {
1167                         /*
1168                          * this fragment is not first, the headers
1169                          * space is regarded as data space.
1170                          */
1171                         *mtu = orig_mtu;
1172                 }
1173                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1174                               + fragheaderlen - sizeof(struct frag_hdr);
1175         }
1176 }
1177
1178 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1179                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1180                           struct rt6_info *rt, struct flowi6 *fl6)
1181 {
             /* Initialise the corking state for a new pending datagram.
              *
              * Copies the tx options from @ipc6 into a fresh allocation hung
              * off @v6_cork, takes a reference on @rt and stores it together
              * with a copy of @fl6 in @cork, records hop limit and traffic
              * class, and computes the fragment size.
              *
              * Returns 0 on success, -EINVAL if @v6_cork already carries
              * options or the resulting MTU is below IPV6_MIN_MTU, and
              * -ENOBUFS if an allocation fails.
              *
              * Nothing is undone here on failure: options duplicated so far
              * stay attached to @v6_cork, and the -EINVAL MTU return happens
              * after the route reference was stored in @cork.
              * NOTE(review): presumably the caller releases the cork on
              * error - confirm against the callers.
              */
1182         struct ipv6_pinfo *np = inet6_sk(sk);
1183         unsigned int mtu;
1184         struct ipv6_txoptions *opt = ipc6->opt;
1185
1186         /*
1187          * setup for corking
1188          */
1189         if (opt) {
1190                 if (WARN_ON(v6_cork->opt))
1191                         return -EINVAL;
1192
1193                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1194                 if (unlikely(!v6_cork->opt))
1195                         return -ENOBUFS;
1196
                     /* tot_len covers the struct only; each header copied
                      * below is its own allocation.
                      */
1197                 v6_cork->opt->tot_len = sizeof(*opt);
1198                 v6_cork->opt->opt_flen = opt->opt_flen;
1199                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1200
                     /* For each header: a NULL copy is an error only when the
                      * source header was present.
                      */
1201                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1202                                                     sk->sk_allocation);
1203                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1204                         return -ENOBUFS;
1205
1206                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1207                                                     sk->sk_allocation);
1208                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1209                         return -ENOBUFS;
1210
1211                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1212                                                    sk->sk_allocation);
1213                 if (opt->hopopt && !v6_cork->opt->hopopt)
1214                         return -ENOBUFS;
1215
1216                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1217                                                     sk->sk_allocation);
1218                 if (opt->srcrt && !v6_cork->opt->srcrt)
1219                         return -ENOBUFS;
1220
1221                 /* need source address above miyazawa*/
1222         }
1223         dst_hold(&rt->dst);
1224         cork->base.dst = &rt->dst;
1225         cork->fl.u.ip6 = *fl6;
1226         v6_cork->hop_limit = ipc6->hlimit;
1227         v6_cork->tclass = ipc6->tclass;
             /* With pmtudisc at or above IPV6_PMTUDISC_PROBE the device MTU is
              * used, otherwise the route MTU.  The two branches differ only
              * in which dst supplies the route MTU: rt->dst itself for
              * DST_XFRM_TUNNEL routes, xfrm_dst_path(&rt->dst) otherwise.
              */
1228         if (rt->dst.flags & DST_XFRM_TUNNEL)
1229                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1230                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1231         else
1232                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1233                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
             /* A non-zero np->frag_size smaller than the MTU replaces it. */
1234         if (np->frag_size < mtu) {
1235                 if (np->frag_size)
1236                         mtu = np->frag_size;
1237         }
1238         if (mtu < IPV6_MIN_MTU)
1239                 return -EINVAL;
1240         cork->base.fragsize = mtu;
1241         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1242                 cork->base.flags |= IPCORK_ALLFRAG;
1243         cork->base.length = 0;
1244
1245         return 0;
1246 }
1247
1248 static int __ip6_append_data(struct sock *sk,
1249                              struct flowi6 *fl6,
1250                              struct sk_buff_head *queue,
1251                              struct inet_cork *cork,
1252                              struct inet6_cork *v6_cork,
1253                              struct page_frag *pfrag,
1254                              int getfrag(void *from, char *to, int offset,
1255                                          int len, int odd, struct sk_buff *skb),
1256                              void *from, int length, int transhdrlen,
1257                              unsigned int flags, struct ipcm6_cookie *ipc6,
1258                              const struct sockcm_cookie *sockc)
1259 {
             /* Append @length bytes, produced by @getfrag from @from, to the
              * pending skbs on @queue, adding new skbs as the tail one fills.
              *
              * @transhdrlen bytes are left unwritten in front of the payload
              * of the first skb allocated here (getfrag() is pointed at
              * data + transhdrlen).  @cork and @v6_cork supply the route,
              * fragment size and options; @pfrag is the page fragment used
              * when the device has NETIF_F_SG.
              *
              * Returns 0 on success.  On failure a negative errno is
              * returned, the part of @length that was not appended is taken
              * off cork->length again and IPSTATS_MIB_OUTDISCARDS is
              * incremented; skbs already queued stay on @queue.
              */
1260         struct sk_buff *skb, *skb_prev = NULL;
1261         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1262         int exthdrlen = 0;
1263         int dst_exthdrlen = 0;
1264         int hh_len;
1265         int copy;
1266         int err;
1267         int offset = 0;
1268         __u8 tx_flags = 0;
1269         u32 tskey = 0;
1270         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1271         struct ipv6_txoptions *opt = v6_cork->opt;
1272         int csummode = CHECKSUM_NONE;
1273         unsigned int maxnonfragsize, headersize;
1274         unsigned int wmem_alloc_delta = 0;
1275
             /* The extension header lengths are only accounted for when the
              * queue is empty, i.e. for the first skb of the datagram.
              */
1276         skb = skb_peek_tail(queue);
1277         if (!skb) {
1278                 exthdrlen = opt ? opt->opt_flen : 0;
1279                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1280         }
1281
1282         mtu = cork->fragsize;
1283         orig_mtu = mtu;
1284
1285         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1286
             /* fragheaderlen: bytes repeated at the front of every skb built
              * here.  maxfraglen: largest skb length that leaves room for a
              * fragment header and keeps the part after fragheaderlen a
              * multiple of 8.
              */
1287         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1288                         (opt ? opt->opt_nflen : 0);
1289         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1290                      sizeof(struct frag_hdr);
1291
1292         headersize = sizeof(struct ipv6hdr) +
1293                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1294                      (dst_allfrag(&rt->dst) ?
1295                       sizeof(struct frag_hdr) : 0) +
1296                      rt->rt6i_nfheader_len;
1297
1298         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1299          * the first fragment
1300          */
1301         if (headersize + transhdrlen > mtu)
1302                 goto emsgsize;
1303
             /* With dontfrag set, UDP and RAW sockets get the usable size
              * reported through ipv6_local_rxpmtu() before the EMSGSIZE error.
              */
1304         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1305             (sk->sk_protocol == IPPROTO_UDP ||
1306              sk->sk_protocol == IPPROTO_RAW)) {
1307                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1308                                 sizeof(struct ipv6hdr));
1309                 goto emsgsize;
1310         }
1311
1312         if (ip6_sk_ignore_df(sk))
1313                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1314         else
1315                 maxnonfragsize = mtu;
1316
             /* The emsgsize label sits inside this block so that the two
              * checks above share the same error reporting.
              */
1317         if (cork->length + length > maxnonfragsize - headersize) {
1318 emsgsize:
1319                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1320                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1321                 return -EMSGSIZE;
1322         }
1323
1324         /* CHECKSUM_PARTIAL only with no extension headers and when
1325          * we are not going to fragment
1326          */
1327         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1328             headersize == sizeof(struct ipv6hdr) &&
1329             length <= mtu - headersize &&
1330             !(flags & MSG_MORE) &&
1331             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1332                 csummode = CHECKSUM_PARTIAL;
1333
             /* tx_flags and tskey collected here are handed to the first skb
              * allocated below and then cleared.
              */
1334         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1335                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1336                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1337                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1338                         tskey = sk->sk_tskey++;
1339         }
1340
1341         /*
1342          * Let's try using as much space as possible.
1343          * Use MTU if total length of the message fits into the MTU.
1344          * Otherwise, we need to reserve fragment header and
1345          * fragment alignment (= 8-15 octets, in total).
1346          *
1347          * Note that we may need to "move" the data from the tail
1348          * of the buffer to the new fragment when we split
1349          * the message.
1350          *
1351          * FIXME: It may be fragmented into multiple chunks
1352          *        at once if non-fragmentable extension headers
1353          *        are too large.
1354          * --yoshfuji
1355          */
1356
             /* The whole @length is added up front; the error path subtracts
              * whatever is still outstanding.
              */
1357         cork->length += length;
1358         if (!skb)
1359                 goto alloc_new_skb;
1360
1361         while (length > 0) {
1362                 /* Check if the remaining data fits into current packet. */
1363                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1364                 if (copy < length)
1365                         copy = maxfraglen - skb->len;
1366
1367                 if (copy <= 0) {
1368                         char *data;
1369                         unsigned int datalen;
1370                         unsigned int fraglen;
1371                         unsigned int fraggap;
1372                         unsigned int alloclen;
1373 alloc_new_skb:
1374                         /* There's no room in the current skb */
                             /* fraggap: bytes of the current skb beyond
                              * maxfraglen; they are moved to the new skb below.
                              */
1375                         if (skb)
1376                                 fraggap = skb->len - maxfraglen;
1377                         else
1378                                 fraggap = 0;
1379                         /* update mtu and maxfraglen if necessary */
1380                         if (!skb || !skb_prev)
1381                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1382                                                     fragheaderlen, skb, rt,
1383                                                     orig_mtu);
1384
1385                         skb_prev = skb;
1386
1387                         /*
1388                          * If remaining data exceeds the mtu,
1389                          * we know we need more fragment(s).
1390                          */
1391                         datalen = length + fraggap;
1392
1393                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1394                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                             /* With MSG_MORE on a device without NETIF_F_SG a
                              * full mtu is allocated, since later data can only
                              * go into the linear area of this skb.
                              */
1395                         if ((flags & MSG_MORE) &&
1396                             !(rt->dst.dev->features&NETIF_F_SG))
1397                                 alloclen = mtu;
1398                         else
1399                                 alloclen = datalen + fragheaderlen;
1400
1401                         alloclen += dst_exthdrlen;
1402
1403                         if (datalen != length + fraggap) {
1404                                 /*
1405                                  * this is not the last fragment, the trailer
1406                                  * space is regarded as data space.
1407                                  */
1408                                 datalen += rt->dst.trailer_len;
1409                         }
1410
1411                         alloclen += rt->dst.trailer_len;
1412                         fraglen = datalen + fragheaderlen;
1413
1414                         /*
1415                          * We just reserve space for fragment header.
1416                          * Note: this may be overallocation if the message
1417                          * (without MSG_MORE) fits into the MTU.
1418                          */
1419                         alloclen += sizeof(struct frag_hdr);
1420
                             /* copy: new payload bytes to fetch via getfrag()
                              * for this skb; a negative value is rejected.
                              */
1421                         copy = datalen - transhdrlen - fraggap;
1422                         if (copy < 0) {
1423                                 err = -EINVAL;
1424                                 goto error;
1425                         }
                             /* The skb that carries the transport header is
                              * obtained through sock_alloc_send_skb().  Later
                              * skbs use alloc_skb() and are only allocated
                              * while sk_wmem_alloc plus the not yet committed
                              * delta is within twice sk_sndbuf.
                              */
1426                         if (transhdrlen) {
1427                                 skb = sock_alloc_send_skb(sk,
1428                                                 alloclen + hh_len,
1429                                                 (flags & MSG_DONTWAIT), &err);
1430                         } else {
1431                                 skb = NULL;
1432                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1433                                     2 * sk->sk_sndbuf)
1434                                         skb = alloc_skb(alloclen + hh_len,
1435                                                         sk->sk_allocation);
1436                                 if (unlikely(!skb))
1437                                         err = -ENOBUFS;
1438                         }
1439                         if (!skb)
1440                                 goto error;
1441                         /*
1442                          *      Fill in the control structures
1443                          */
1444                         skb->protocol = htons(ETH_P_IPV6);
1445                         skb->ip_summed = csummode;
1446                         skb->csum = 0;
1447                         /* reserve for fragmentation and ipsec header */
1448                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1449                                     dst_exthdrlen);
1450
1451                         /* Only the initial fragment is time stamped */
1452                         skb_shinfo(skb)->tx_flags = tx_flags;
1453                         tx_flags = 0;
1454                         skb_shinfo(skb)->tskey = tskey;
1455                         tskey = 0;
1456
1457                         /*
1458                          *      Find where to start putting bytes
1459                          */
1460                         data = skb_put(skb, fraglen);
1461                         skb_set_network_header(skb, exthdrlen);
1462                         data += fragheaderlen;
1463                         skb->transport_header = (skb->network_header +
1464                                                  fragheaderlen);
                             /* Move the overhang of the previous skb into the
                              * new one, fix up the previous skb's checksum and
                              * trim it back to maxfraglen.
                              */
1465                         if (fraggap) {
1466                                 skb->csum = skb_copy_and_csum_bits(
1467                                         skb_prev, maxfraglen,
1468                                         data + transhdrlen, fraggap, 0);
1469                                 skb_prev->csum = csum_sub(skb_prev->csum,
1470                                                           skb->csum);
1471                                 data += fraggap;
1472                                 pskb_trim_unique(skb_prev, maxfraglen);
1473                         }
                             /* The new skb is not on the queue yet, so it is
                              * freed here when getfrag() fails.
                              */
1474                         if (copy > 0 &&
1475                             getfrag(from, data + transhdrlen, offset,
1476                                     copy, fraggap, skb) < 0) {
1477                                 err = -EFAULT;
1478                                 kfree_skb(skb);
1479                                 goto error;
1480                         }
1481
                             /* Header lengths apply to one skb only; clear
                              * them for any further skbs.
                              */
1482                         offset += copy;
1483                         length -= datalen - fraggap;
1484                         transhdrlen = 0;
1485                         exthdrlen = 0;
1486                         dst_exthdrlen = 0;
1487
1488                         if ((flags & MSG_CONFIRM) && !skb_prev)
1489                                 skb_set_dst_pending_confirm(skb, 1);
1490
1491                         /*
1492                          * Put the packet on the pending queue
1493                          */
                             /* An skb without a destructor is tied to the
                              * socket here; its truesize is collected in
                              * wmem_alloc_delta and charged to sk_wmem_alloc
                              * in one step on return.
                              */
1494                         if (!skb->destructor) {
1495                                 skb->destructor = sock_wfree;
1496                                 skb->sk = sk;
1497                                 wmem_alloc_delta += skb->truesize;
1498                         }
1499                         __skb_queue_tail(queue, skb);
1500                         continue;
1501                 }
1502
1503                 if (copy > length)
1504                         copy = length;
1505
1506                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1507                         unsigned int off;
1508
                             /* No scatter-gather: extend the linear area and
                              * undo the extension if getfrag() fails.
                              */
1509                         off = skb->len;
1510                         if (getfrag(from, skb_put(skb, copy),
1511                                                 offset, copy, off, skb) < 0) {
1512                                 __skb_trim(skb, off);
1513                                 err = -EFAULT;
1514                                 goto error;
1515                         }
1516                 } else {
1517                         int i = skb_shinfo(skb)->nr_frags;
1518
                             /* err is preset for each "goto error" that follows
                              * without an assignment of its own.
                              */
1519                         err = -ENOMEM;
1520                         if (!sk_page_frag_refill(sk, pfrag))
1521                                 goto error;
1522
                             /* Start a new, initially empty page fragment
                              * unless the data can be merged into the last
                              * one; a full fragment array gives -EMSGSIZE.
                              */
1523                         if (!skb_can_coalesce(skb, i, pfrag->page,
1524                                               pfrag->offset)) {
1525                                 err = -EMSGSIZE;
1526                                 if (i == MAX_SKB_FRAGS)
1527                                         goto error;
1528
1529                                 __skb_fill_page_desc(skb, i, pfrag->page,
1530                                                      pfrag->offset, 0);
1531                                 skb_shinfo(skb)->nr_frags = ++i;
1532                                 get_page(pfrag->page);
1533                         }
1534                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1535                         if (getfrag(from,
1536                                     page_address(pfrag->page) + pfrag->offset,
1537                                     offset, copy, skb->len, skb) < 0)
1538                                 goto error_efault;
1539
1540                         pfrag->offset += copy;
1541                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1542                         skb->len += copy;
1543                         skb->data_len += copy;
1544                         skb->truesize += copy;
1545                         wmem_alloc_delta += copy;
1546                 }
1547                 offset += copy;
1548                 length -= copy;
1549         }
1550
1551         if (wmem_alloc_delta)
1552                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1553         return 0;
1554
1555 error_efault:
1556         err = -EFAULT;
1557 error:
             /* Unlike the success path, the accumulated delta is added here
              * without testing it for zero.
              */
1558         cork->length -= length;
1559         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1560         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1561         return err;
1562 }
1563
1564 int ip6_append_data(struct sock *sk,
1565                     int getfrag(void *from, char *to, int offset, int len,
1566                                 int odd, struct sk_buff *skb),
1567                     void *from, int length, int transhdrlen,
1568                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1569                     struct rt6_info *rt, unsigned int flags,
1570                     const struct sockcm_cookie *sockc)
1571 {
1572         struct inet_sock *inet = inet_sk(sk);
1573         struct ipv6_pinfo *np = inet6_sk(sk);
1574         int exthdrlen;
1575         int err;
1576
1577         if (flags&MSG_PROBE)
1578                 return 0;
1579         if (skb_queue_empty(&sk->sk_write_queue)) {
1580                 /*
1581                  * setup for corking
1582                  */
1583                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1584                                      ipc6, rt, fl6);
1585                 if (err)
1586                         return err;
1587
1588                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1589                 length += exthdrlen;
1590                 transhdrlen += exthdrlen;
1591         } else {
1592                 fl6 = &inet->cork.fl.u.ip6;
1593                 transhdrlen = 0;
1594         }
1595
1596         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1597                                  &np->cork, sk_page_frag(sk), getfrag,
1598                                  from, length, transhdrlen, flags, ipc6, sockc);
1599 }
1600 EXPORT_SYMBOL_GPL(ip6_append_data);
1601
1602 static void ip6_cork_release(struct inet_cork_full *cork,
1603                              struct inet6_cork *v6_cork)
1604 {
1605         if (v6_cork->opt) {
1606                 kfree(v6_cork->opt->dst0opt);
1607                 kfree(v6_cork->opt->dst1opt);
1608                 kfree(v6_cork->opt->hopopt);
1609                 kfree(v6_cork->opt->srcrt);
1610                 kfree(v6_cork->opt);
1611                 v6_cork->opt = NULL;
1612         }
1613
1614         if (cork->base.dst) {
1615                 dst_release(cork->base.dst);
1616                 cork->base.dst = NULL;
1617                 cork->base.flags &= ~IPCORK_ALLFRAG;
1618         }
1619         memset(&cork->fl, 0, sizeof(cork->fl));
1620 }
1621
1622 struct sk_buff *__ip6_make_skb(struct sock *sk,
1623                                struct sk_buff_head *queue,
1624                                struct inet_cork_full *cork,
1625                                struct inet6_cork *v6_cork)
1626 {
1627         struct sk_buff *skb, *tmp_skb;
1628         struct sk_buff **tail_skb;
1629         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1630         struct ipv6_pinfo *np = inet6_sk(sk);
1631         struct net *net = sock_net(sk);
1632         struct ipv6hdr *hdr;
1633         struct ipv6_txoptions *opt = v6_cork->opt;
1634         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1635         struct flowi6 *fl6 = &cork->fl.u.ip6;
1636         unsigned char proto = fl6->flowi6_proto;
1637
1638         skb = __skb_dequeue(queue);
1639         if (!skb)
1640                 goto out;
1641         tail_skb = &(skb_shinfo(skb)->frag_list);
1642
1643         /* move skb->data to ip header from ext header */
1644         if (skb->data < skb_network_header(skb))
1645                 __skb_pull(skb, skb_network_offset(skb));
1646         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1647                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1648                 *tail_skb = tmp_skb;
1649                 tail_skb = &(tmp_skb->next);
1650                 skb->len += tmp_skb->len;
1651                 skb->data_len += tmp_skb->len;
1652                 skb->truesize += tmp_skb->truesize;
1653                 tmp_skb->destructor = NULL;
1654                 tmp_skb->sk = NULL;
1655         }
1656
1657         /* Allow local fragmentation. */
1658         skb->ignore_df = ip6_sk_ignore_df(sk);
1659
1660         *final_dst = fl6->daddr;
1661         __skb_pull(skb, skb_network_header_len(skb));
1662         if (opt && opt->opt_flen)
1663                 ipv6_push_frag_opts(skb, opt, &proto);
1664         if (opt && opt->opt_nflen)
1665                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1666
1667         skb_push(skb, sizeof(struct ipv6hdr));
1668         skb_reset_network_header(skb);
1669         hdr = ipv6_hdr(skb);
1670
1671         ip6_flow_hdr(hdr, v6_cork->tclass,
1672                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1673                                         ip6_autoflowlabel(net, np), fl6));
1674         hdr->hop_limit = v6_cork->hop_limit;
1675         hdr->nexthdr = proto;
1676         hdr->saddr = fl6->saddr;
1677         hdr->daddr = *final_dst;
1678
1679         skb->priority = sk->sk_priority;
1680         skb->mark = sk->sk_mark;
1681
1682         skb_dst_set(skb, dst_clone(&rt->dst));
1683         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1684         if (proto == IPPROTO_ICMPV6) {
1685                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1686
1687                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1688                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1689         }
1690
1691         ip6_cork_release(cork, v6_cork);
1692 out:
1693         return skb;
1694 }
1695
1696 int ip6_send_skb(struct sk_buff *skb)
1697 {
1698         struct net *net = sock_net(skb->sk);
1699         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1700         int err;
1701
1702         err = ip6_local_out(net, skb->sk, skb);
1703         if (err) {
1704                 if (err > 0)
1705                         err = net_xmit_errno(err);
1706                 if (err)
1707                         IP6_INC_STATS(net, rt->rt6i_idev,
1708                                       IPSTATS_MIB_OUTDISCARDS);
1709         }
1710
1711         return err;
1712 }
1713
/*
 *	Finish whatever is pending on the socket and transmit it.
 *
 *	Returns 0 when ip6_finish_skb() produced no packet, otherwise the
 *	result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1725
1726 static void __ip6_flush_pending_frames(struct sock *sk,
1727                                        struct sk_buff_head *queue,
1728                                        struct inet_cork_full *cork,
1729                                        struct inet6_cork *v6_cork)
1730 {
1731         struct sk_buff *skb;
1732
1733         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1734                 if (skb_dst(skb))
1735                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1736                                       IPSTATS_MIB_OUTDISCARDS);
1737                 kfree_skb(skb);
1738         }
1739
1740         ip6_cork_release(cork, v6_cork);
1741 }
1742
1743 void ip6_flush_pending_frames(struct sock *sk)
1744 {
1745         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1746                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1747 }
1748 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1749
1750 struct sk_buff *ip6_make_skb(struct sock *sk,
1751                              int getfrag(void *from, char *to, int offset,
1752                                          int len, int odd, struct sk_buff *skb),
1753                              void *from, int length, int transhdrlen,
1754                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1755                              struct rt6_info *rt, unsigned int flags,
1756                              const struct sockcm_cookie *sockc)
1757 {
1758         struct inet_cork_full cork;
1759         struct inet6_cork v6_cork;
1760         struct sk_buff_head queue;
1761         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1762         int err;
1763
1764         if (flags & MSG_PROBE)
1765                 return NULL;
1766
1767         __skb_queue_head_init(&queue);
1768
1769         cork.base.flags = 0;
1770         cork.base.addr = 0;
1771         cork.base.opt = NULL;
1772         cork.base.dst = NULL;
1773         v6_cork.opt = NULL;
1774         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1775         if (err) {
1776                 ip6_cork_release(&cork, &v6_cork);
1777                 return ERR_PTR(err);
1778         }
1779         if (ipc6->dontfrag < 0)
1780                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1781
1782         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1783                                 &current->task_frag, getfrag, from,
1784                                 length + exthdrlen, transhdrlen + exthdrlen,
1785                                 flags, ipc6, sockc);
1786         if (err) {
1787                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1788                 return ERR_PTR(err);
1789         }
1790
1791         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1792 }