net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 static int ip6_finish_output2(struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         struct neighbour *neigh;
  64         struct in6_addr *nexthop;
  65         int ret;
  66
  67         skb->protocol = htons(ETH_P_IPV6);
  68         skb->dev = dev;
  69
  70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  72
  73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
  74                     ((mroute6_socket(dev_net(dev), skb) &&
  75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  77                                          &ipv6_hdr(skb)->saddr))) {
  78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  79
  80                         /* Do not check for IFF_ALLMULTI; multicast routing
  81                            is not supported in any case.
  82                          */
  83                         if (newskb)
  84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  85                                         newskb, NULL, newskb->dev,
  86                                         dev_loopback_xmit);
  87
  88                         if (ipv6_hdr(skb)->hop_limit == 0) {
  89                                 IP6_INC_STATS(dev_net(dev), idev,
  90                                               IPSTATS_MIB_OUTDISCARDS);
  91                                 kfree_skb(skb);
  92                                 return 0;
  93                         }
  94                 }
  95
  96                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
  97                                 skb->len);
  98
  99                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 100                     IPV6_ADDR_SCOPE_NODELOCAL &&
 101                     !(dev->flags & IFF_LOOPBACK)) {
 102                         kfree_skb(skb);
 103                         return 0;
 104                 }
 105         }
 106
 107         rcu_read_lock_bh();
 108         nexthop = rt6_nexthop((struct rt6_info *)dst);
 109         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 110         if (unlikely(!neigh))
 111                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 112         if (!IS_ERR(neigh)) {
 113                 ret = dst_neigh_output(dst, neigh, skb);
 114                 rcu_read_unlock_bh();
 115                 return ret;
 116         }
 117         rcu_read_unlock_bh();
 118
 119         IP6_INC_STATS(dev_net(dst->dev),
 120                       ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 121         kfree_skb(skb);
 122         return -EINVAL;
 123 }
 124
 125 static int ip6_finish_output(struct sk_buff *skb)
 126 {
 127         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 128             dst_allfrag(skb_dst(skb)) ||
 129             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 130                 return ip6_fragment(skb, ip6_finish_output2);
 131         else
 132                 return ip6_finish_output2(skb);
 133 }
 134
 135 int ip6_output(struct sk_buff *skb)
 136 {
 137         struct net_device *dev = skb_dst(skb)->dev;
 138         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 139         if (unlikely(idev->cnf.disable_ipv6)) {
 140                 IP6_INC_STATS(dev_net(dev), idev,
 141                               IPSTATS_MIB_OUTDISCARDS);
 142                 kfree_skb(skb);
 143                 return 0;
 144         }
 145
 146         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 147                             ip6_finish_output,
 148                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 149 }
 150
 151 /*
 152  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 153  */
 154
 155 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 156              struct ipv6_txoptions *opt, int tclass)
 157 {
 158         struct net *net = sock_net(sk);
 159         struct ipv6_pinfo *np = inet6_sk(sk);
 160         struct in6_addr *first_hop = &fl6->daddr;
 161         struct dst_entry *dst = skb_dst(skb);
 162         struct ipv6hdr *hdr;
 163         u8  proto = fl6->flowi6_proto;
 164         int seg_len = skb->len;
 165         int hlimit = -1;
 166         u32 mtu;
 167
 168         if (opt) {
 169                 unsigned int head_room;
 170
 171                 /* First: exthdrs may take lots of space (~8K for now)
 172                    MAX_HEADER is not enough.
 173                  */
 174                 head_room = opt->opt_nflen + opt->opt_flen;
 175                 seg_len += head_room;
 176                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 177
 178                 if (skb_headroom(skb) < head_room) {
 179                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 180                         if (skb2 == NULL) {
 181                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 182                                               IPSTATS_MIB_OUTDISCARDS);
 183                                 kfree_skb(skb);
 184                                 return -ENOBUFS;
 185                         }
 186                         consume_skb(skb);
 187                         skb = skb2;
 188                         skb_set_owner_w(skb, sk);
 189                 }
 190                 if (opt->opt_flen)
 191                         ipv6_push_frag_opts(skb, opt, &proto);
 192                 if (opt->opt_nflen)
 193                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 194         }
 195
 196         skb_push(skb, sizeof(struct ipv6hdr));
 197         skb_reset_network_header(skb);
 198         hdr = ipv6_hdr(skb);
 199
 200         /*
 201          *      Fill in the IPv6 header
 202          */
 203         if (np)
 204                 hlimit = np->hop_limit;
 205         if (hlimit < 0)
 206                 hlimit = ip6_dst_hoplimit(dst);
 207
 208         ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
 209
 210         hdr->payload_len = htons(seg_len);
 211         hdr->nexthdr = proto;
 212         hdr->hop_limit = hlimit;
 213
 214         hdr->saddr = fl6->saddr;
 215         hdr->daddr = *first_hop;
 216
 217         skb->protocol = htons(ETH_P_IPV6);
 218         skb->priority = sk->sk_priority;
 219         skb->mark = sk->sk_mark;
 220
 221         mtu = dst_mtu(dst);
 222         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 223                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 224                               IPSTATS_MIB_OUT, skb->len);
 225                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 226                                dst->dev, dst_output);
 227         }
 228
 229         skb->dev = dst->dev;
 230         ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
 231         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 232         kfree_skb(skb);
 233         return -EMSGSIZE;
 234 }
 235
 236 EXPORT_SYMBOL(ip6_xmit);
 237
 238 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 239 {
 240         struct ip6_ra_chain *ra;
 241         struct sock *last = NULL;
 242
 243         read_lock(&ip6_ra_lock);
 244         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 245                 struct sock *sk = ra->sk;
 246                 if (sk && ra->sel == sel &&
 247                     (!sk->sk_bound_dev_if ||
 248                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 249                         if (last) {
 250                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 251                                 if (skb2)
 252                                         rawv6_rcv(last, skb2);
 253                         }
 254                         last = sk;
 255                 }
 256         }
 257
 258         if (last) {
 259                 rawv6_rcv(last, skb);
 260                 read_unlock(&ip6_ra_lock);
 261                 return 1;
 262         }
 263         read_unlock(&ip6_ra_lock);
 264         return 0;
 265 }
 266
 267 static int ip6_forward_proxy_check(struct sk_buff *skb)
 268 {
 269         struct ipv6hdr *hdr = ipv6_hdr(skb);
 270         u8 nexthdr = hdr->nexthdr;
 271         __be16 frag_off;
 272         int offset;
 273
 274         if (ipv6_ext_hdr(nexthdr)) {
 275                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 276                 if (offset < 0)
 277                         return 0;
 278         } else
 279                 offset = sizeof(struct ipv6hdr);
 280
 281         if (nexthdr == IPPROTO_ICMPV6) {
 282                 struct icmp6hdr *icmp6;
 283
 284                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 285                                          offset + 1 - skb->data)))
 286                         return 0;
 287
 288                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 289
 290                 switch (icmp6->icmp6_type) {
 291                 case NDISC_ROUTER_SOLICITATION:
 292                 case NDISC_ROUTER_ADVERTISEMENT:
 293                 case NDISC_NEIGHBOUR_SOLICITATION:
 294                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 295                 case NDISC_REDIRECT:
 296                         /* For reaction involving unicast neighbor discovery
 297                          * message destined to the proxied address, pass it to
 298                          * input function.
 299                          */
 300                         return 1;
 301                 default:
 302                         break;
 303                 }
 304         }
 305
 306         /*
 307          * The proxying router can't forward traffic sent to a link-local
 308          * address, so signal the sender and discard the packet. This
 309          * behavior is clarified by the MIPv6 specification.
 310          */
 311         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 312                 dst_link_failure(skb);
 313                 return -1;
 314         }
 315
 316         return 0;
 317 }
 318
 319 static inline int ip6_forward_finish(struct sk_buff *skb)
 320 {
 321         return dst_output(skb);
 322 }
 323
 324 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 325 {
 326         unsigned int mtu;
 327         struct inet6_dev *idev;
 328
 329         if (dst_metric_locked(dst, RTAX_MTU)) {
 330                 mtu = dst_metric_raw(dst, RTAX_MTU);
 331                 if (mtu)
 332                         return mtu;
 333         }
 334
 335         mtu = IPV6_MIN_MTU;
 336         rcu_read_lock();
 337         idev = __in6_dev_get(dst->dev);
 338         if (idev)
 339                 mtu = idev->cnf.mtu6;
 340         rcu_read_unlock();
 341
 342         return mtu;
 343 }
 344
 345 int ip6_forward(struct sk_buff *skb)
 346 {
 347         struct dst_entry *dst = skb_dst(skb);
 348         struct ipv6hdr *hdr = ipv6_hdr(skb);
 349         struct inet6_skb_parm *opt = IP6CB(skb);
 350         struct net *net = dev_net(dst->dev);
 351         u32 mtu;
 352
 353         if (net->ipv6.devconf_all->forwarding == 0)
 354                 goto error;
 355
 356         if (skb_warn_if_lro(skb))
 357                 goto drop;
 358
 359         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 360                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 361                                  IPSTATS_MIB_INDISCARDS);
 362                 goto drop;
 363         }
 364
 365         if (skb->pkt_type != PACKET_HOST)
 366                 goto drop;
 367
 368         skb_forward_csum(skb);
 369
 370         /*
 371          *      We DO NOT make any processing on
 372          *      RA packets, pushing them to user level AS IS
 373          *      without ane WARRANTY that application will be able
 374          *      to interpret them. The reason is that we
 375          *      cannot make anything clever here.
 376          *
 377          *      We are not end-node, so that if packet contains
 378          *      AH/ESP, we cannot make anything.
 379          *      Defragmentation also would be mistake, RA packets
 380          *      cannot be fragmented, because there is no warranty
 381          *      that different fragments will go along one path. --ANK
 382          */
 383         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 384                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 385                         return 0;
 386         }
 387
 388         /*
 389          *      check and decrement ttl
 390          */
 391         if (hdr->hop_limit <= 1) {
 392                 /* Force OUTPUT device used as source address */
 393                 skb->dev = dst->dev;
 394                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 395                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 396                                  IPSTATS_MIB_INHDRERRORS);
 397
 398                 kfree_skb(skb);
 399                 return -ETIMEDOUT;
 400         }
 401
 402         /* XXX: idev->cnf.proxy_ndp? */
 403         if (net->ipv6.devconf_all->proxy_ndp &&
 404             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 405                 int proxied = ip6_forward_proxy_check(skb);
 406                 if (proxied > 0)
 407                         return ip6_input(skb);
 408                 else if (proxied < 0) {
 409                         IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 410                                          IPSTATS_MIB_INDISCARDS);
 411                         goto drop;
 412                 }
 413         }
 414
 415         if (!xfrm6_route_forward(skb)) {
 416                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 417                                  IPSTATS_MIB_INDISCARDS);
 418                 goto drop;
 419         }
 420         dst = skb_dst(skb);
 421
 422         /* IPv6 specs say nothing about it, but it is clear that we cannot
 423            send redirects to source routed frames.
 424            We don't send redirects to frames decapsulated from IPsec.
 425          */
 426         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 427                 struct in6_addr *target = NULL;
 428                 struct inet_peer *peer;
 429                 struct rt6_info *rt;
 430
 431                 /*
 432                  *      incoming and outgoing devices are the same
 433                  *      send a redirect.
 434                  */
 435
 436                 rt = (struct rt6_info *) dst;
 437                 if (rt->rt6i_flags & RTF_GATEWAY)
 438                         target = &rt->rt6i_gateway;
 439                 else
 440                         target = &hdr->daddr;
 441
 442                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 443
 444                 /* Limit redirects both by destination (here)
 445                    and by source (inside ndisc_send_redirect)
 446                  */
 447                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 448                         ndisc_send_redirect(skb, target);
 449                 if (peer)
 450                         inet_putpeer(peer);
 451         } else {
 452                 int addrtype = ipv6_addr_type(&hdr->saddr);
 453
 454                 /* This check is security critical. */
 455                 if (addrtype == IPV6_ADDR_ANY ||
 456                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 457                         goto error;
 458                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 459                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 460                                     ICMPV6_NOT_NEIGHBOUR, 0);
 461                         goto error;
 462                 }
 463         }
 464
 465         mtu = ip6_dst_mtu_forward(dst);
 466         if (mtu < IPV6_MIN_MTU)
 467                 mtu = IPV6_MIN_MTU;
 468
 469         if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
 470             (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
 471                 /* Again, force OUTPUT device used as source address */
 472                 skb->dev = dst->dev;
 473                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 474                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 475                                  IPSTATS_MIB_INTOOBIGERRORS);
 476                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 477                                  IPSTATS_MIB_FRAGFAILS);
 478                 kfree_skb(skb);
 479                 return -EMSGSIZE;
 480         }
 481
 482         if (skb_cow(skb, dst->dev->hard_header_len)) {
 483                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
 484                                  IPSTATS_MIB_OUTDISCARDS);
 485                 goto drop;
 486         }
 487
 488         hdr = ipv6_hdr(skb);
 489
 490         /* Mangling hops number delayed to point after skb COW */
 491
 492         hdr->hop_limit--;
 493
 494         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 495         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 496         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 497                        ip6_forward_finish);
 498
 499 error:
 500         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 501 drop:
 502         kfree_skb(skb);
 503         return -EINVAL;
 504 }
 505
 506 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 507 {
 508         to->pkt_type = from->pkt_type;
 509         to->priority = from->priority;
 510         to->protocol = from->protocol;
 511         skb_dst_drop(to);
 512         skb_dst_set(to, dst_clone(skb_dst(from)));
 513         to->dev = from->dev;
 514         to->mark = from->mark;
 515
 516 #ifdef CONFIG_NET_SCHED
 517         to->tc_index = from->tc_index;
 518 #endif
 519         nf_copy(to, from);
 520 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 521         to->nf_trace = from->nf_trace;
 522 #endif
 523         skb_copy_secmark(to, from);
 524 }
 525
 526 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 527 {
 528         struct sk_buff *frag;
 529         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 530         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 531         struct ipv6hdr *tmp_hdr;
 532         struct frag_hdr *fh;
 533         unsigned int mtu, hlen, left, len;
 534         int hroom, troom;
 535         __be32 frag_id = 0;
 536         int ptr, offset = 0, err=0;
 537         u8 *prevhdr, nexthdr = 0;
 538         struct net *net = dev_net(skb_dst(skb)->dev);
 539
 540         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 541         nexthdr = *prevhdr;
 542
 543         mtu = ip6_skb_dst_mtu(skb);
 544
 545         /* We must not fragment if the socket is set to force MTU discovery
 546          * or if the skb it not generated by a local socket.
 547          */
 548         if (unlikely(!skb->local_df && skb->len > mtu) ||
 549                      (IP6CB(skb)->frag_max_size &&
 550                       IP6CB(skb)->frag_max_size > mtu)) {
 551                 if (skb->sk && dst_allfrag(skb_dst(skb)))
 552                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 553
 554                 skb->dev = skb_dst(skb)->dev;
 555                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 556                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 557                               IPSTATS_MIB_FRAGFAILS);
 558                 kfree_skb(skb);
 559                 return -EMSGSIZE;
 560         }
 561
 562         if (np && np->frag_size < mtu) {
 563                 if (np->frag_size)
 564                         mtu = np->frag_size;
 565         }
 566         mtu -= hlen + sizeof(struct frag_hdr);
 567
 568         if (skb_has_frag_list(skb)) {
 569                 int first_len = skb_pagelen(skb);
 570                 struct sk_buff *frag2;
 571
 572                 if (first_len - hlen > mtu ||
 573                     ((first_len - hlen) & 7) ||
 574                     skb_cloned(skb))
 575                         goto slow_path;
 576
 577                 skb_walk_frags(skb, frag) {
 578                         /* Correct geometry. */
 579                         if (frag->len > mtu ||
 580                             ((frag->len & 7) && frag->next) ||
 581                             skb_headroom(frag) < hlen)
 582                                 goto slow_path_clean;
 583
 584                         /* Partially cloned skb? */
 585                         if (skb_shared(frag))
 586                                 goto slow_path_clean;
 587
 588                         BUG_ON(frag->sk);
 589                         if (skb->sk) {
 590                                 frag->sk = skb->sk;
 591                                 frag->destructor = sock_wfree;
 592                         }
 593                         skb->truesize -= frag->truesize;
 594                 }
 595
 596                 err = 0;
 597                 offset = 0;
 598                 frag = skb_shinfo(skb)->frag_list;
 599                 skb_frag_list_init(skb);
 600                 /* BUILD HEADER */
 601
 602                 *prevhdr = NEXTHDR_FRAGMENT;
 603                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 604                 if (!tmp_hdr) {
 605                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 606                                       IPSTATS_MIB_FRAGFAILS);
 607                         return -ENOMEM;
 608                 }
 609
 610                 __skb_pull(skb, hlen);
 611                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 612                 __skb_push(skb, hlen);
 613                 skb_reset_network_header(skb);
 614                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 615
 616                 ipv6_select_ident(fh, rt);
 617                 fh->nexthdr = nexthdr;
 618                 fh->reserved = 0;
 619                 fh->frag_off = htons(IP6_MF);
 620                 frag_id = fh->identification;
 621
 622                 first_len = skb_pagelen(skb);
 623                 skb->data_len = first_len - skb_headlen(skb);
 624                 skb->len = first_len;
 625                 ipv6_hdr(skb)->payload_len = htons(first_len -
 626                                                    sizeof(struct ipv6hdr));
 627
 628                 dst_hold(&rt->dst);
 629
 630                 for (;;) {
 631                         /* Prepare header of the next frame,
 632                          * before previous one went down. */
 633                         if (frag) {
 634                                 frag->ip_summed = CHECKSUM_NONE;
 635                                 skb_reset_transport_header(frag);
 636                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 637                                 __skb_push(frag, hlen);
 638                                 skb_reset_network_header(frag);
 639                                 memcpy(skb_network_header(frag), tmp_hdr,
 640                                        hlen);
 641                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 642                                 fh->nexthdr = nexthdr;
 643                                 fh->reserved = 0;
 644                                 fh->frag_off = htons(offset);
 645                                 if (frag->next != NULL)
 646                                         fh->frag_off |= htons(IP6_MF);
 647                                 fh->identification = frag_id;
 648                                 ipv6_hdr(frag)->payload_len =
 649                                                 htons(frag->len -
 650                                                       sizeof(struct ipv6hdr));
 651                                 ip6_copy_metadata(frag, skb);
 652                         }
 653
 654                         err = output(skb);
 655                         if(!err)
 656                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 657                                               IPSTATS_MIB_FRAGCREATES);
 658
 659                         if (err || !frag)
 660                                 break;
 661
 662                         skb = frag;
 663                         frag = skb->next;
 664                         skb->next = NULL;
 665                 }
 666
 667                 kfree(tmp_hdr);
 668
 669                 if (err == 0) {
 670                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 671                                       IPSTATS_MIB_FRAGOKS);
 672                         ip6_rt_put(rt);
 673                         return 0;
 674                 }
 675
 676                 while (frag) {
 677                         skb = frag->next;
 678                         kfree_skb(frag);
 679                         frag = skb;
 680                 }
 681
 682                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 683                               IPSTATS_MIB_FRAGFAILS);
 684                 ip6_rt_put(rt);
 685                 return err;
 686
 687 slow_path_clean:
 688                 skb_walk_frags(skb, frag2) {
 689                         if (frag2 == frag)
 690                                 break;
 691                         frag2->sk = NULL;
 692                         frag2->destructor = NULL;
 693                         skb->truesize += frag2->truesize;
 694                 }
 695         }
 696
 697 slow_path:
 698         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 699             skb_checksum_help(skb))
 700                 goto fail;
 701
 702         left = skb->len - hlen;         /* Space per frame */
 703         ptr = hlen;                     /* Where to start from */
 704
 705         /*
 706          *      Fragment the datagram.
 707          */
 708
 709         *prevhdr = NEXTHDR_FRAGMENT;
 710         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 711         troom = rt->dst.dev->needed_tailroom;
 712
 713         /*
 714          *      Keep copying data until we run out.
 715          */
 716         while(left > 0) {
 717                 len = left;
 718                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 719                 if (len > mtu)
 720                         len = mtu;
 721                 /* IF: we are not sending up to and including the packet end
 722                    then align the next start on an eight byte boundary */
 723                 if (len < left) {
 724                         len &= ~7;
 725                 }
 726                 /*
 727                  *      Allocate buffer.
 728                  */
 729
 730                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 731                                       hroom + troom, GFP_ATOMIC)) == NULL) {
 732                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 733                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 734                                       IPSTATS_MIB_FRAGFAILS);
 735                         err = -ENOMEM;
 736                         goto fail;
 737                 }
 738
 739                 /*
 740                  *      Set up data on packet
 741                  */
 742
 743                 ip6_copy_metadata(frag, skb);
 744                 skb_reserve(frag, hroom);
 745                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 746                 skb_reset_network_header(frag);
 747                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 748                 frag->transport_header = (frag->network_header + hlen +
 749                                           sizeof(struct frag_hdr));
 750
 751                 /*
 752                  *      Charge the memory for the fragment to any owner
 753                  *      it might possess
 754                  */
 755                 if (skb->sk)
 756                         skb_set_owner_w(frag, skb->sk);
 757
 758                 /*
 759                  *      Copy the packet header into the new buffer.
 760                  */
 761                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 762
 763                 /*
 764                  *      Build fragment header.
 765                  */
 766                 fh->nexthdr = nexthdr;
 767                 fh->reserved = 0;
 768                 if (!frag_id) {
 769                         ipv6_select_ident(fh, rt);
 770                         frag_id = fh->identification;
 771                 } else
 772                         fh->identification = frag_id;
 773
 774                 /*
 775                  *      Copy a block of the IP datagram.
 776                  */
 777                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 778                         BUG();
 779                 left -= len;
 780
 781                 fh->frag_off = htons(offset);
 782                 if (left > 0)
 783                         fh->frag_off |= htons(IP6_MF);
 784                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 785                                                     sizeof(struct ipv6hdr));
 786
 787                 ptr += len;
 788                 offset += len;
 789
 790                 /*
 791                  *      Put this fragment into the sending queue.
 792                  */
 793                 err = output(frag);
 794                 if (err)
 795                         goto fail;
 796
 797                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 798                               IPSTATS_MIB_FRAGCREATES);
 799         }
 800         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 801                       IPSTATS_MIB_FRAGOKS);
 802         consume_skb(skb);
 803         return err;
 804
 805 fail:
 806         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 807                       IPSTATS_MIB_FRAGFAILS);
 808         kfree_skb(skb);
 809         return err;
 810 }
 811
 812 static inline int ip6_rt_check(const struct rt6key *rt_key,
 813                                const struct in6_addr *fl_addr,
 814                                const struct in6_addr *addr_cache)
 815 {
 816         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 817                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 818 }
 819
 820 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 821                                           struct dst_entry *dst,
 822                                           const struct flowi6 *fl6)
 823 {
 824         struct ipv6_pinfo *np = inet6_sk(sk);
 825         struct rt6_info *rt;
 826
 827         if (!dst)
 828                 goto out;
 829
 830         if (dst->ops->family != AF_INET6) {
 831                 dst_release(dst);
 832                 return NULL;
 833         }
 834
 835         rt = (struct rt6_info *)dst;
 836         /* Yes, checking route validity in not connected
 837          * case is not very simple. Take into account,
 838          * that we do not support routing by source, TOS,
 839          * and MSG_DONTROUTE            --ANK (980726)
 840          *
 841          * 1. ip6_rt_check(): If route was host route,
 842          *    check that cached destination is current.
 843          *    If it is network route, we still may
 844          *    check its validity using saved pointer
 845          *    to the last used address: daddr_cache.
 846          *    We do not want to save whole address now,
 847          *    (because main consumer of this service
 848          *    is tcp, which has not this problem),
 849          *    so that the last trick works only on connected
 850          *    sockets.
 851          * 2. oif also should be the same.
 852          */
 853         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 854 #ifdef CONFIG_IPV6_SUBTREES
 855             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 856 #endif
 857             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 858                 dst_release(dst);
 859                 dst = NULL;
 860         }
 861
 862 out:
 863         return dst;
 864 }
 865
 866 static int ip6_dst_lookup_tail(struct sock *sk,
 867                                struct dst_entry **dst, struct flowi6 *fl6)
 868 {
 869         struct net *net = sock_net(sk);
 870 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 871         struct neighbour *n;
 872         struct rt6_info *rt;
 873 #endif
 874         int err;
 875
 876         if (*dst == NULL)
 877                 *dst = ip6_route_output(net, sk, fl6);
 878
 879         if ((err = (*dst)->error))
 880                 goto out_err_release;
 881
 882         if (ipv6_addr_any(&fl6->saddr)) {
 883                 struct rt6_info *rt = (struct rt6_info *) *dst;
 884                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 885                                           sk ? inet6_sk(sk)->srcprefs : 0,
 886                                           &fl6->saddr);
 887                 if (err)
 888                         goto out_err_release;
 889         }
 890
 891 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 892         /*
 893          * Here if the dst entry we've looked up
 894          * has a neighbour entry that is in the INCOMPLETE
 895          * state and the src address from the flow is
 896          * marked as OPTIMISTIC, we release the found
 897          * dst entry and replace it instead with the
 898          * dst entry of the nexthop router
 899          */
 900         rt = (struct rt6_info *) *dst;
 901         rcu_read_lock_bh();
 902         n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
 903         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 904         rcu_read_unlock_bh();
 905
 906         if (err) {
 907                 struct inet6_ifaddr *ifp;
 908                 struct flowi6 fl_gw6;
 909                 int redirect;
 910
 911                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 912                                       (*dst)->dev, 1);
 913
 914                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 915                 if (ifp)
 916                         in6_ifa_put(ifp);
 917
 918                 if (redirect) {
 919                         /*
 920                          * We need to get the dst entry for the
 921                          * default router instead
 922                          */
 923                         dst_release(*dst);
 924                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 925                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 926                         *dst = ip6_route_output(net, sk, &fl_gw6);
 927                         if ((err = (*dst)->error))
 928                                 goto out_err_release;
 929                 }
 930         }
 931 #endif
 932
 933         return 0;
 934
 935 out_err_release:
 936         if (err == -ENETUNREACH)
 937                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 938         dst_release(*dst);
 939         *dst = NULL;
 940         return err;
 941 }
 942
 943 /**
 944  *      ip6_dst_lookup - perform route lookup on flow
 945  *      @sk: socket which provides route info
 946  *      @dst: pointer to dst_entry * for result
 947  *      @fl6: flow to lookup
 948  *
 949  *      This function performs a route lookup on the given flow.
 950  *
 951  *      It returns zero on success, or a standard errno code on error.
 952  */
 953 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
 954 {
 955         *dst = NULL;
 956         return ip6_dst_lookup_tail(sk, dst, fl6);
 957 }
 958 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 959
 960 /**
 961  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 962  *      @sk: socket which provides route info
 963  *      @fl6: flow to lookup
 964  *      @final_dst: final destination address for ipsec lookup
 965  *
 966  *      This function performs a route lookup on the given flow.
 967  *
 968  *      It returns a valid dst pointer on success, or a pointer encoded
 969  *      error code.
 970  */
 971 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 972                                       const struct in6_addr *final_dst)
 973 {
 974         struct dst_entry *dst = NULL;
 975         int err;
 976
 977         err = ip6_dst_lookup_tail(sk, &dst, fl6);
 978         if (err)
 979                 return ERR_PTR(err);
 980         if (final_dst)
 981                 fl6->daddr = *final_dst;
 982
 983         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 984 }
 985 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
 986
 987 /**
 988  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 989  *      @sk: socket which provides the dst cache and route info
 990  *      @fl6: flow to lookup
 991  *      @final_dst: final destination address for ipsec lookup
 992  *
 993  *      This function performs a route lookup on the given flow with the
 994  *      possibility of using the cached route in the socket if it is valid.
 995  *      It will take the socket dst lock when operating on the dst cache.
 996  *      As a result, this function can only be used in process context.
 997  *
 998  *      It returns a valid dst pointer on success, or a pointer encoded
 999  *      error code.
1000  */
1001 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1002                                          const struct in6_addr *final_dst)
1003 {
1004         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1005         int err;
1006
1007         dst = ip6_sk_dst_check(sk, dst, fl6);
1008
1009         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1010         if (err)
1011                 return ERR_PTR(err);
1012         if (final_dst)
1013                 fl6->daddr = *final_dst;
1014
1015         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1016 }
1017 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1018
1019 static inline int ip6_ufo_append_data(struct sock *sk,
1020                         int getfrag(void *from, char *to, int offset, int len,
1021                         int odd, struct sk_buff *skb),
1022                         void *from, int length, int hh_len, int fragheaderlen,
1023                         int transhdrlen, int mtu,unsigned int flags,
1024                         struct rt6_info *rt)
1025
1026 {
1027         struct sk_buff *skb;
1028         struct frag_hdr fhdr;
1029         int err;
1030
1031         /* There is support for UDP large send offload by network
1032          * device, so create one single skb packet containing complete
1033          * udp datagram
1034          */
1035         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1036                 skb = sock_alloc_send_skb(sk,
1037                         hh_len + fragheaderlen + transhdrlen + 20,
1038                         (flags & MSG_DONTWAIT), &err);
1039                 if (skb == NULL)
1040                         return err;
1041
1042                 /* reserve space for Hardware header */
1043                 skb_reserve(skb, hh_len);
1044
1045                 /* create space for UDP/IP header */
1046                 skb_put(skb,fragheaderlen + transhdrlen);
1047
1048                 /* initialize network header pointer */
1049                 skb_reset_network_header(skb);
1050
1051                 /* initialize protocol header pointer */
1052                 skb->transport_header = skb->network_header + fragheaderlen;
1053
1054                 skb->protocol = htons(ETH_P_IPV6);
1055                 skb->csum = 0;
1056
1057                 __skb_queue_tail(&sk->sk_write_queue, skb);
1058         } else if (skb_is_gso(skb)) {
1059                 goto append;
1060         }
1061
1062         skb->ip_summed = CHECKSUM_PARTIAL;
1063         /* Specify the length of each IPv6 datagram fragment.
1064          * It has to be a multiple of 8.
1065          */
1066         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1067                                      sizeof(struct frag_hdr)) & ~7;
1068         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1069         ipv6_select_ident(&fhdr, rt);
1070         skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1071
1072 append:
1073         return skb_append_datato_frags(sk, skb, getfrag, from,
1074                                        (length - transhdrlen));
1075 }
1076
1077 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1078                                                gfp_t gfp)
1079 {
1080         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1081 }
1082
1083 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1084                                                 gfp_t gfp)
1085 {
1086         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1087 }
1088
1089 static void ip6_append_data_mtu(unsigned int *mtu,
1090                                 int *maxfraglen,
1091                                 unsigned int fragheaderlen,
1092                                 struct sk_buff *skb,
1093                                 struct rt6_info *rt,
1094                                 bool pmtuprobe)
1095 {
1096         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1097                 if (skb == NULL) {
1098                         /* first fragment, reserve header_len */
1099                         *mtu = *mtu - rt->dst.header_len;
1100
1101                 } else {
1102                         /*
1103                          * this fragment is not first, the headers
1104                          * space is regarded as data space.
1105                          */
1106                         *mtu = min(*mtu, pmtuprobe ?
1107                                    rt->dst.dev->mtu :
1108                                    dst_mtu(rt->dst.path));
1109                 }
1110                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1111                               + fragheaderlen - sizeof(struct frag_hdr);
1112         }
1113 }
1114
1115 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1116         int offset, int len, int odd, struct sk_buff *skb),
1117         void *from, int length, int transhdrlen,
1118         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1119         struct rt6_info *rt, unsigned int flags, int dontfrag)
1120 {
1121         struct inet_sock *inet = inet_sk(sk);
1122         struct ipv6_pinfo *np = inet6_sk(sk);
1123         struct inet_cork *cork;
1124         struct sk_buff *skb, *skb_prev = NULL;
1125         unsigned int maxfraglen, fragheaderlen, mtu;
1126         int exthdrlen;
1127         int dst_exthdrlen;
1128         int hh_len;
1129         int copy;
1130         int err;
1131         int offset = 0;
1132         __u8 tx_flags = 0;
1133
1134         if (flags&MSG_PROBE)
1135                 return 0;
1136         cork = &inet->cork.base;
1137         if (skb_queue_empty(&sk->sk_write_queue)) {
1138                 /*
1139                  * setup for corking
1140                  */
1141                 if (opt) {
1142                         if (WARN_ON(np->cork.opt))
1143                                 return -EINVAL;
1144
1145                         np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1146                         if (unlikely(np->cork.opt == NULL))
1147                                 return -ENOBUFS;
1148
1149                         np->cork.opt->tot_len = opt->tot_len;
1150                         np->cork.opt->opt_flen = opt->opt_flen;
1151                         np->cork.opt->opt_nflen = opt->opt_nflen;
1152
1153                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1154                                                             sk->sk_allocation);
1155                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1156                                 return -ENOBUFS;
1157
1158                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1159                                                             sk->sk_allocation);
1160                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1161                                 return -ENOBUFS;
1162
1163                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1164                                                            sk->sk_allocation);
1165                         if (opt->hopopt && !np->cork.opt->hopopt)
1166                                 return -ENOBUFS;
1167
1168                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1169                                                             sk->sk_allocation);
1170                         if (opt->srcrt && !np->cork.opt->srcrt)
1171                                 return -ENOBUFS;
1172
1173                         /* need source address above miyazawa*/
1174                 }
1175                 dst_hold(&rt->dst);
1176                 cork->dst = &rt->dst;
1177                 inet->cork.fl.u.ip6 = *fl6;
1178                 np->cork.hop_limit = hlimit;
1179                 np->cork.tclass = tclass;
1180                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1181                         mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1182                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1183                 else
1184                         mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1185                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1186                 if (np->frag_size < mtu) {
1187                         if (np->frag_size)
1188                                 mtu = np->frag_size;
1189                 }
1190                 cork->fragsize = mtu;
1191                 if (dst_allfrag(rt->dst.path))
1192                         cork->flags |= IPCORK_ALLFRAG;
1193                 cork->length = 0;
1194                 exthdrlen = (opt ? opt->opt_flen : 0);
1195                 length += exthdrlen;
1196                 transhdrlen += exthdrlen;
1197                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1198         } else {
1199                 rt = (struct rt6_info *)cork->dst;
1200                 fl6 = &inet->cork.fl.u.ip6;
1201                 opt = np->cork.opt;
1202                 transhdrlen = 0;
1203                 exthdrlen = 0;
1204                 dst_exthdrlen = 0;
1205                 mtu = cork->fragsize;
1206         }
1207
1208         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1209
1210         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1211                         (opt ? opt->opt_nflen : 0);
1212         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1213                      sizeof(struct frag_hdr);
1214
1215         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1216                 unsigned int maxnonfragsize, headersize;
1217
1218                 headersize = sizeof(struct ipv6hdr) +
1219                              (opt ? opt->tot_len : 0) +
1220                              (dst_allfrag(&rt->dst) ?
1221                               sizeof(struct frag_hdr) : 0) +
1222                              rt->rt6i_nfheader_len;
1223
1224                 maxnonfragsize = (np->pmtudisc >= IPV6_PMTUDISC_DO) ?
1225                                  mtu : sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1226
1227                 /* dontfrag active */
1228                 if ((cork->length + length > mtu - headersize) && dontfrag &&
1229                     (sk->sk_protocol == IPPROTO_UDP ||
1230                      sk->sk_protocol == IPPROTO_RAW)) {
1231                         ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1232                                                    sizeof(struct ipv6hdr));
1233                         goto emsgsize;
1234                 }
1235
1236                 if (cork->length + length > maxnonfragsize - headersize) {
1237 emsgsize:
1238                         ipv6_local_error(sk, EMSGSIZE, fl6,
1239                                          mtu - headersize +
1240                                          sizeof(struct ipv6hdr));
1241                         return -EMSGSIZE;
1242                 }
1243         }
1244
1245         /* For UDP, check if TX timestamp is enabled */
1246         if (sk->sk_type == SOCK_DGRAM)
1247                 sock_tx_timestamp(sk, &tx_flags);
1248
1249         /*
1250          * Let's try using as much space as possible.
1251          * Use MTU if total length of the message fits into the MTU.
1252          * Otherwise, we need to reserve fragment header and
1253          * fragment alignment (= 8-15 octects, in total).
1254          *
1255          * Note that we may need to "move" the data from the tail of
1256          * of the buffer to the new fragment when we split
1257          * the message.
1258          *
1259          * FIXME: It may be fragmented into multiple chunks
1260          *        at once if non-fragmentable extension headers
1261          *        are too large.
1262          * --yoshfuji
1263          */
1264
1265         skb = skb_peek_tail(&sk->sk_write_queue);
1266         cork->length += length;
1267         if (((length > mtu) ||
1268              (skb && skb_is_gso(skb))) &&
1269             (sk->sk_protocol == IPPROTO_UDP) &&
1270             (rt->dst.dev->features & NETIF_F_UFO)) {
1271                 err = ip6_ufo_append_data(sk, getfrag, from, length,
1272                                           hh_len, fragheaderlen,
1273                                           transhdrlen, mtu, flags, rt);
1274                 if (err)
1275                         goto error;
1276                 return 0;
1277         }
1278
1279         if (!skb)
1280                 goto alloc_new_skb;
1281
1282         while (length > 0) {
1283                 /* Check if the remaining data fits into current packet. */
1284                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1285                 if (copy < length)
1286                         copy = maxfraglen - skb->len;
1287
1288                 if (copy <= 0) {
1289                         char *data;
1290                         unsigned int datalen;
1291                         unsigned int fraglen;
1292                         unsigned int fraggap;
1293                         unsigned int alloclen;
1294 alloc_new_skb:
1295                         /* There's no room in the current skb */
1296                         if (skb)
1297                                 fraggap = skb->len - maxfraglen;
1298                         else
1299                                 fraggap = 0;
1300                         /* update mtu and maxfraglen if necessary */
1301                         if (skb == NULL || skb_prev == NULL)
1302                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1303                                                     fragheaderlen, skb, rt,
1304                                                     np->pmtudisc >=
1305                                                     IPV6_PMTUDISC_PROBE);
1306
1307                         skb_prev = skb;
1308
1309                         /*
1310                          * If remaining data exceeds the mtu,
1311                          * we know we need more fragment(s).
1312                          */
1313                         datalen = length + fraggap;
1314
1315                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1316                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1317                         if ((flags & MSG_MORE) &&
1318                             !(rt->dst.dev->features&NETIF_F_SG))
1319                                 alloclen = mtu;
1320                         else
1321                                 alloclen = datalen + fragheaderlen;
1322
1323                         alloclen += dst_exthdrlen;
1324
1325                         if (datalen != length + fraggap) {
1326                                 /*
1327                                  * this is not the last fragment, the trailer
1328                                  * space is regarded as data space.
1329                                  */
1330                                 datalen += rt->dst.trailer_len;
1331                         }
1332
1333                         alloclen += rt->dst.trailer_len;
1334                         fraglen = datalen + fragheaderlen;
1335
1336                         /*
1337                          * We just reserve space for fragment header.
1338                          * Note: this may be overallocation if the message
1339                          * (without MSG_MORE) fits into the MTU.
1340                          */
1341                         alloclen += sizeof(struct frag_hdr);
1342
1343                         if (transhdrlen) {
1344                                 skb = sock_alloc_send_skb(sk,
1345                                                 alloclen + hh_len,
1346                                                 (flags & MSG_DONTWAIT), &err);
1347                         } else {
1348                                 skb = NULL;
1349                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1350                                     2 * sk->sk_sndbuf)
1351                                         skb = sock_wmalloc(sk,
1352                                                            alloclen + hh_len, 1,
1353                                                            sk->sk_allocation);
1354                                 if (unlikely(skb == NULL))
1355                                         err = -ENOBUFS;
1356                                 else {
1357                                         /* Only the initial fragment
1358                                          * is time stamped.
1359                                          */
1360                                         tx_flags = 0;
1361                                 }
1362                         }
1363                         if (skb == NULL)
1364                                 goto error;
1365                         /*
1366                          *      Fill in the control structures
1367                          */
1368                         skb->protocol = htons(ETH_P_IPV6);
1369                         skb->ip_summed = CHECKSUM_NONE;
1370                         skb->csum = 0;
1371                         /* reserve for fragmentation and ipsec header */
1372                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1373                                     dst_exthdrlen);
1374
1375                         if (sk->sk_type == SOCK_DGRAM)
1376                                 skb_shinfo(skb)->tx_flags = tx_flags;
1377
1378                         /*
1379                          *      Find where to start putting bytes
1380                          */
1381                         data = skb_put(skb, fraglen);
1382                         skb_set_network_header(skb, exthdrlen);
1383                         data += fragheaderlen;
1384                         skb->transport_header = (skb->network_header +
1385                                                  fragheaderlen);
1386                         if (fraggap) {
1387                                 skb->csum = skb_copy_and_csum_bits(
1388                                         skb_prev, maxfraglen,
1389                                         data + transhdrlen, fraggap, 0);
1390                                 skb_prev->csum = csum_sub(skb_prev->csum,
1391                                                           skb->csum);
1392                                 data += fraggap;
1393                                 pskb_trim_unique(skb_prev, maxfraglen);
1394                         }
1395                         copy = datalen - transhdrlen - fraggap;
1396
1397                         if (copy < 0) {
1398                                 err = -EINVAL;
1399                                 kfree_skb(skb);
1400                                 goto error;
1401                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1402                                 err = -EFAULT;
1403                                 kfree_skb(skb);
1404                                 goto error;
1405                         }
1406
1407                         offset += copy;
1408                         length -= datalen - fraggap;
1409                         transhdrlen = 0;
1410                         exthdrlen = 0;
1411                         dst_exthdrlen = 0;
1412
1413                         /*
1414                          * Put the packet on the pending queue
1415                          */
1416                         __skb_queue_tail(&sk->sk_write_queue, skb);
1417                         continue;
1418                 }
1419
1420                 if (copy > length)
1421                         copy = length;
1422
1423                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1424                         unsigned int off;
1425
1426                         off = skb->len;
1427                         if (getfrag(from, skb_put(skb, copy),
1428                                                 offset, copy, off, skb) < 0) {
1429                                 __skb_trim(skb, off);
1430                                 err = -EFAULT;
1431                                 goto error;
1432                         }
1433                 } else {
1434                         int i = skb_shinfo(skb)->nr_frags;
1435                         struct page_frag *pfrag = sk_page_frag(sk);
1436
1437                         err = -ENOMEM;
1438                         if (!sk_page_frag_refill(sk, pfrag))
1439                                 goto error;
1440
1441                         if (!skb_can_coalesce(skb, i, pfrag->page,
1442                                               pfrag->offset)) {
1443                                 err = -EMSGSIZE;
1444                                 if (i == MAX_SKB_FRAGS)
1445                                         goto error;
1446
1447                                 __skb_fill_page_desc(skb, i, pfrag->page,
1448                                                      pfrag->offset, 0);
1449                                 skb_shinfo(skb)->nr_frags = ++i;
1450                                 get_page(pfrag->page);
1451                         }
1452                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1453                         if (getfrag(from,
1454                                     page_address(pfrag->page) + pfrag->offset,
1455                                     offset, copy, skb->len, skb) < 0)
1456                                 goto error_efault;
1457
1458                         pfrag->offset += copy;
1459                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1460                         skb->len += copy;
1461                         skb->data_len += copy;
1462                         skb->truesize += copy;
1463                         atomic_add(copy, &sk->sk_wmem_alloc);
1464                 }
1465                 offset += copy;
1466                 length -= copy;
1467         }
1468
1469         return 0;
1470
1471 error_efault:
1472         err = -EFAULT;
1473 error:
1474         cork->length -= length;
1475         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1476         return err;
1477 }
1478 EXPORT_SYMBOL_GPL(ip6_append_data);
1479
1480 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1481 {
1482         if (np->cork.opt) {
1483                 kfree(np->cork.opt->dst0opt);
1484                 kfree(np->cork.opt->dst1opt);
1485                 kfree(np->cork.opt->hopopt);
1486                 kfree(np->cork.opt->srcrt);
1487                 kfree(np->cork.opt);
1488                 np->cork.opt = NULL;
1489         }
1490
1491         if (inet->cork.base.dst) {
1492                 dst_release(inet->cork.base.dst);
1493                 inet->cork.base.dst = NULL;
1494                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1495         }
1496         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1497 }
1498
1499 int ip6_push_pending_frames(struct sock *sk)
1500 {
1501         struct sk_buff *skb, *tmp_skb;
1502         struct sk_buff **tail_skb;
1503         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1504         struct inet_sock *inet = inet_sk(sk);
1505         struct ipv6_pinfo *np = inet6_sk(sk);
1506         struct net *net = sock_net(sk);
1507         struct ipv6hdr *hdr;
1508         struct ipv6_txoptions *opt = np->cork.opt;
1509         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1510         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1511         unsigned char proto = fl6->flowi6_proto;
1512         int err = 0;
1513
1514         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1515                 goto out;
1516         tail_skb = &(skb_shinfo(skb)->frag_list);
1517
1518         /* move skb->data to ip header from ext header */
1519         if (skb->data < skb_network_header(skb))
1520                 __skb_pull(skb, skb_network_offset(skb));
1521         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1522                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1523                 *tail_skb = tmp_skb;
1524                 tail_skb = &(tmp_skb->next);
1525                 skb->len += tmp_skb->len;
1526                 skb->data_len += tmp_skb->len;
1527                 skb->truesize += tmp_skb->truesize;
1528                 tmp_skb->destructor = NULL;
1529                 tmp_skb->sk = NULL;
1530         }
1531
1532         /* Allow local fragmentation. */
1533         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1534                 skb->local_df = 1;
1535
1536         *final_dst = fl6->daddr;
1537         __skb_pull(skb, skb_network_header_len(skb));
1538         if (opt && opt->opt_flen)
1539                 ipv6_push_frag_opts(skb, opt, &proto);
1540         if (opt && opt->opt_nflen)
1541                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1542
1543         skb_push(skb, sizeof(struct ipv6hdr));
1544         skb_reset_network_header(skb);
1545         hdr = ipv6_hdr(skb);
1546
1547         ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1548         hdr->hop_limit = np->cork.hop_limit;
1549         hdr->nexthdr = proto;
1550         hdr->saddr = fl6->saddr;
1551         hdr->daddr = *final_dst;
1552
1553         skb->priority = sk->sk_priority;
1554         skb->mark = sk->sk_mark;
1555
1556         skb_dst_set(skb, dst_clone(&rt->dst));
1557         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1558         if (proto == IPPROTO_ICMPV6) {
1559                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1560
1561                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1562                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1563         }
1564
1565         err = ip6_local_out(skb);
1566         if (err) {
1567                 if (err > 0)
1568                         err = net_xmit_errno(err);
1569                 if (err)
1570                         goto error;
1571         }
1572
1573 out:
1574         ip6_cork_release(inet, np);
1575         return err;
1576 error:
1577         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1578         goto out;
1579 }
1580 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1581
1582 void ip6_flush_pending_frames(struct sock *sk)
1583 {
1584         struct sk_buff *skb;
1585
1586         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1587                 if (skb_dst(skb))
1588                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1589                                       IPSTATS_MIB_OUTDISCARDS);
1590                 kfree_skb(skb);
1591         }
1592
1593         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1594 }
1595 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);