/*
 *	net/ipv6/ip6_output.c
 *
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>

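/* Final transmit step: resolve the IPv6 next hop to a neighbour entry and
 * hand the packet to the device layer. Multicast packets are looped back
 * to local listeners when required and have their scope checked before
 * transmission.
 */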
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

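/* Fragment the packet if it exceeds the path MTU (and is not GSO), or if
 * the route or conntrack defrag state demands it; otherwise pass it
 * straight to ip6_finish_output2().
 */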
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

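/* dst_output() entry point: drop everything when IPv6 is administratively
 * disabled on the egress device, otherwise run the packet through the
 * NF_INET_POST_ROUTING hook on its way to ip6_finish_output().
 */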
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			if (skb->sk)
				skb_set_owner_w(skb2, skb->sk);
			consume_skb(skb);
			skb = skb2;
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

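/* Deliver Router Alert packets to every raw socket that registered for
 * this RA selector (IPV6_ROUTER_ALERT); the last matching socket gets the
 * original skb, earlier ones get clones. Returns 1 if anyone took it.
 */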
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

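/* Classify a packet destined to a proxied address: returns 1 to hand it
 * to local input (unicast neighbour discovery), 0 to keep forwarding it,
 * or -1 to drop it (link-local destination, per the MIPv6 spec).
 */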
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	skb_sender_cpu_clear(skb);
	return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

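/* Forward a packet on behalf of another host: validate it, honour Router
 * Alert, send ICMPv6 errors (hop limit exceeded, packet too big), emit a
 * redirect when the packet leaves through the interface it arrived on,
 * then decrement the hop limit and pass it to the NF_INET_FORWARD hook.
 */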
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

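/* Split a too-large packet into fragments no bigger than the path MTU and
 * feed each one to @output. The fast path reuses an existing frag_list as
 * ready-made fragments; the slow path copies the data into freshly
 * allocated skbs, one per fragment.
 */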
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}

		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

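/* Validate a socket's cached route against the current flow: release it
 * and return NULL when it is not IPv6 or no longer matches the flow's
 * destination (or source, with subtrees) or outgoing interface.
 */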
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

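/* Common tail of the dst lookup helpers below: perform the route lookup
 * for @fl6, selecting a source address first when none was given, and
 * (under CONFIG_IPV6_OPTIMISTIC_DAD) retry through the default router if
 * the chosen source address is still optimistic and the next hop is
 * unresolved.
 */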
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (!fl6->flowi6_oif)
		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

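/* Queue a UDP datagram as one large GSO skb for UDP fragmentation offload
 * (UFO); the device later segments it into fragments of gso_size bytes,
 * which therefore must be a multiple of 8.
 */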
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, const struct flowi6 *fl6)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
							 &fl6->daddr,
							 &fl6->saddr);

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

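/* Refresh mtu/maxfraglen while appending: outside of an xfrm tunnel dst,
 * the first fragment must reserve dst.header_len, while later fragments
 * may treat that headroom as data space.
 */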
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

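/* Initialize the cork for a corked send: duplicate the tx options so they
 * outlive the caller, pin the route, and derive the fragment size from
 * the path MTU (optionally capped by np->frag_size, i.e. IPV6_MTU).
 */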
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

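/* Workhorse shared by ip6_append_data() and ip6_make_skb(): append user
 * data to the pending queue, topping up the tail skb or allocating new
 * ones so every packet can later be fragmented on a maxfraglen boundary,
 * and diverting to the UFO path when the device can segment for us.
 */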
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length < mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM)
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if ((skb && skb_is_gso(skb)) ||
	    (((length + (skb ? skb->len : headersize)) > mtu) &&
	    (skb_queue_len(queue) <= 1) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

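/* Public corked-send API: on the first call (empty write queue) set up the
 * cork, then append @length bytes fetched through @getfrag to the
 * socket's write queue; ip6_push_pending_frames() transmits the result.
 */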
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

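/* Merge the queued skbs into a single packet (chained via frag_list),
 * push the extension headers and the IPv6 header, fill in flow label,
 * hop limit and addresses, and release the cork. The returned skb is
 * ready for ip6_send_skb().
 */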
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

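/* Send a packet finished by __ip6_make_skb(), translating any positive
 * qdisc return value into an errno via net_xmit_errno().
 */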
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

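/* One-shot counterpart of ip6_append_data(): build the whole datagram on
 * a private queue and return the finished skb (or an ERR_PTR on error)
 * without touching the socket's write queue.
 */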
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}