net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int __ip6_local_out(struct sk_buff *skb)
  60 {
  61         int len;
  62
  63         len = skb->len - sizeof(struct ipv6hdr);
  64         if (len > IPV6_MAXPLEN)
  65                 len = 0;
  66         ipv6_hdr(skb)->payload_len = htons(len);
  67
  68         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  69                        skb_dst(skb)->dev, dst_output);
  70 }
  71
  72 int ip6_local_out(struct sk_buff *skb)
  73 {
  74         int err;
  75
  76         err = __ip6_local_out(skb);
  77         if (likely(err == 1))
  78                 err = dst_output(skb);
  79
  80         return err;
  81 }
  82 EXPORT_SYMBOL_GPL(ip6_local_out);
  83
  84 static int ip6_finish_output2(struct sk_buff *skb)
  85 {
  86         struct dst_entry *dst = skb_dst(skb);
  87         struct net_device *dev = dst->dev;
  88         struct neighbour *neigh;
  89         struct in6_addr *nexthop;
  90         int ret;
  91
  92         skb->protocol = htons(ETH_P_IPV6);
  93         skb->dev = dev;
  94
  95         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  96                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  97
  98                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
  99                     ((mroute6_socket(dev_net(dev), skb) &&
 100                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 101                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 102                                          &ipv6_hdr(skb)->saddr))) {
 103                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 104
 105                         /* Do not check for IFF_ALLMULTI; multicast routing
 106                            is not supported in any case.
 107                          */
 108                         if (newskb)
 109                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 110                                         newskb, NULL, newskb->dev,
 111                                         dev_loopback_xmit);
 112
 113                         if (ipv6_hdr(skb)->hop_limit == 0) {
 114                                 IP6_INC_STATS(dev_net(dev), idev,
 115                                               IPSTATS_MIB_OUTDISCARDS);
 116                                 kfree_skb(skb);
 117                                 return 0;
 118                         }
 119                 }
 120
 121                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 122                                 skb->len);
 123
 124                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 125                     IPV6_ADDR_SCOPE_NODELOCAL &&
 126                     !(dev->flags & IFF_LOOPBACK)) {
 127                         kfree_skb(skb);
 128                         return 0;
 129                 }
 130         }
 131
 132         rcu_read_lock_bh();
 133         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 134         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 135         if (unlikely(!neigh))
 136                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 137         if (!IS_ERR(neigh)) {
 138                 ret = dst_neigh_output(dst, neigh, skb);
 139                 rcu_read_unlock_bh();
 140                 return ret;
 141         }
 142         rcu_read_unlock_bh();
 143
 144         IP6_INC_STATS_BH(dev_net(dst->dev),
 145                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 146         kfree_skb(skb);
 147         return -EINVAL;
 148 }
 149
 150 static int ip6_finish_output(struct sk_buff *skb)
 151 {
 152         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 153             dst_allfrag(skb_dst(skb)))
 154                 return ip6_fragment(skb, ip6_finish_output2);
 155         else
 156                 return ip6_finish_output2(skb);
 157 }
 158
 159 int ip6_output(struct sk_buff *skb)
 160 {
 161         struct net_device *dev = skb_dst(skb)->dev;
 162         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 163         if (unlikely(idev->cnf.disable_ipv6)) {
 164                 IP6_INC_STATS(dev_net(dev), idev,
 165                               IPSTATS_MIB_OUTDISCARDS);
 166                 kfree_skb(skb);
 167                 return 0;
 168         }
 169
 170         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 171                             ip6_finish_output,
 172                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 173 }
 174
 175 /*
 176  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 177  */
 178
 179 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 180              struct ipv6_txoptions *opt, int tclass)
 181 {
 182         struct net *net = sock_net(sk);
 183         struct ipv6_pinfo *np = inet6_sk(sk);
 184         struct in6_addr *first_hop = &fl6->daddr;
 185         struct dst_entry *dst = skb_dst(skb);
 186         struct ipv6hdr *hdr;
 187         u8  proto = fl6->flowi6_proto;
 188         int seg_len = skb->len;
 189         int hlimit = -1;
 190         u32 mtu;
 191
 192         if (opt) {
 193                 unsigned int head_room;
 194
 195                 /* First: exthdrs may take lots of space (~8K for now)
 196                    MAX_HEADER is not enough.
 197                  */
 198                 head_room = opt->opt_nflen + opt->opt_flen;
 199                 seg_len += head_room;
 200                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 201
 202                 if (skb_headroom(skb) < head_room) {
 203                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 204                         if (skb2 == NULL) {
 205                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 206                                               IPSTATS_MIB_OUTDISCARDS);
 207                                 kfree_skb(skb);
 208                                 return -ENOBUFS;
 209                         }
 210                         consume_skb(skb);
 211                         skb = skb2;
 212                         skb_set_owner_w(skb, sk);
 213                 }
 214                 if (opt->opt_flen)
 215                         ipv6_push_frag_opts(skb, opt, &proto);
 216                 if (opt->opt_nflen)
 217                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 218         }
 219
 220         skb_push(skb, sizeof(struct ipv6hdr));
 221         skb_reset_network_header(skb);
 222         hdr = ipv6_hdr(skb);
 223
 224         /*
 225          *      Fill in the IPv6 header
 226          */
 227         if (np)
 228                 hlimit = np->hop_limit;
 229         if (hlimit < 0)
 230                 hlimit = ip6_dst_hoplimit(dst);
 231
 232         ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
 233
 234         hdr->payload_len = htons(seg_len);
 235         hdr->nexthdr = proto;
 236         hdr->hop_limit = hlimit;
 237
 238         hdr->saddr = fl6->saddr;
 239         hdr->daddr = *first_hop;
 240
 241         skb->priority = sk->sk_priority;
 242         skb->mark = sk->sk_mark;
 243
 244         mtu = dst_mtu(dst);
 245         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 246                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 247                               IPSTATS_MIB_OUT, skb->len);
 248                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 249                                dst->dev, dst_output);
 250         }
 251
 252         skb->dev = dst->dev;
 253         ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
 254         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 255         kfree_skb(skb);
 256         return -EMSGSIZE;
 257 }
 258
 259 EXPORT_SYMBOL(ip6_xmit);
 260
 261 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 262 {
 263         struct ip6_ra_chain *ra;
 264         struct sock *last = NULL;
 265
 266         read_lock(&ip6_ra_lock);
 267         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 268                 struct sock *sk = ra->sk;
 269                 if (sk && ra->sel == sel &&
 270                     (!sk->sk_bound_dev_if ||
 271                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 272                         if (last) {
 273                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 274                                 if (skb2)
 275                                         rawv6_rcv(last, skb2);
 276                         }
 277                         last = sk;
 278                 }
 279         }
 280
 281         if (last) {
 282                 rawv6_rcv(last, skb);
 283                 read_unlock(&ip6_ra_lock);
 284                 return 1;
 285         }
 286         read_unlock(&ip6_ra_lock);
 287         return 0;
 288 }
 289
 290 static int ip6_forward_proxy_check(struct sk_buff *skb)
 291 {
 292         struct ipv6hdr *hdr = ipv6_hdr(skb);
 293         u8 nexthdr = hdr->nexthdr;
 294         __be16 frag_off;
 295         int offset;
 296
 297         if (ipv6_ext_hdr(nexthdr)) {
 298                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 299                 if (offset < 0)
 300                         return 0;
 301         } else
 302                 offset = sizeof(struct ipv6hdr);
 303
 304         if (nexthdr == IPPROTO_ICMPV6) {
 305                 struct icmp6hdr *icmp6;
 306
 307                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 308                                          offset + 1 - skb->data)))
 309                         return 0;
 310
 311                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 312
 313                 switch (icmp6->icmp6_type) {
 314                 case NDISC_ROUTER_SOLICITATION:
 315                 case NDISC_ROUTER_ADVERTISEMENT:
 316                 case NDISC_NEIGHBOUR_SOLICITATION:
 317                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 318                 case NDISC_REDIRECT:
 319                         /* For reaction involving unicast neighbor discovery
 320                          * message destined to the proxied address, pass it to
 321                          * input function.
 322                          */
 323                         return 1;
 324                 default:
 325                         break;
 326                 }
 327         }
 328
 329         /*
 330          * The proxying router can't forward traffic sent to a link-local
 331          * address, so signal the sender and discard the packet. This
 332          * behavior is clarified by the MIPv6 specification.
 333          */
 334         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 335                 dst_link_failure(skb);
 336                 return -1;
 337         }
 338
 339         return 0;
 340 }
 341
 342 static inline int ip6_forward_finish(struct sk_buff *skb)
 343 {
 344         return dst_output(skb);
 345 }
 346
 347 int ip6_forward(struct sk_buff *skb)
 348 {
 349         struct dst_entry *dst = skb_dst(skb);
 350         struct ipv6hdr *hdr = ipv6_hdr(skb);
 351         struct inet6_skb_parm *opt = IP6CB(skb);
 352         struct net *net = dev_net(dst->dev);
 353         u32 mtu;
 354
 355         if (net->ipv6.devconf_all->forwarding == 0)
 356                 goto error;
 357
 358         if (skb_warn_if_lro(skb))
 359                 goto drop;
 360
 361         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 362                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 363                 goto drop;
 364         }
 365
 366         if (skb->pkt_type != PACKET_HOST)
 367                 goto drop;
 368
 369         skb_forward_csum(skb);
 370
 371         /*
 372          *      We DO NOT make any processing on
 373          *      RA packets, pushing them to user level AS IS
 374          *      without ane WARRANTY that application will be able
 375          *      to interpret them. The reason is that we
 376          *      cannot make anything clever here.
 377          *
 378          *      We are not end-node, so that if packet contains
 379          *      AH/ESP, we cannot make anything.
 380          *      Defragmentation also would be mistake, RA packets
 381          *      cannot be fragmented, because there is no warranty
 382          *      that different fragments will go along one path. --ANK
 383          */
 384         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 385                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 386                         return 0;
 387         }
 388
 389         /*
 390          *      check and decrement ttl
 391          */
 392         if (hdr->hop_limit <= 1) {
 393                 /* Force OUTPUT device used as source address */
 394                 skb->dev = dst->dev;
 395                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 396                 IP6_INC_STATS_BH(net,
 397                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 398
 399                 kfree_skb(skb);
 400                 return -ETIMEDOUT;
 401         }
 402
 403         /* XXX: idev->cnf.proxy_ndp? */
 404         if (net->ipv6.devconf_all->proxy_ndp &&
 405             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 406                 int proxied = ip6_forward_proxy_check(skb);
 407                 if (proxied > 0)
 408                         return ip6_input(skb);
 409                 else if (proxied < 0) {
 410                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 411                                       IPSTATS_MIB_INDISCARDS);
 412                         goto drop;
 413                 }
 414         }
 415
 416         if (!xfrm6_route_forward(skb)) {
 417                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 418                 goto drop;
 419         }
 420         dst = skb_dst(skb);
 421
 422         /* IPv6 specs say nothing about it, but it is clear that we cannot
 423            send redirects to source routed frames.
 424            We don't send redirects to frames decapsulated from IPsec.
 425          */
 426         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 427                 struct in6_addr *target = NULL;
 428                 struct inet_peer *peer;
 429                 struct rt6_info *rt;
 430
 431                 /*
 432                  *      incoming and outgoing devices are the same
 433                  *      send a redirect.
 434                  */
 435
 436                 rt = (struct rt6_info *) dst;
 437                 if (rt->rt6i_flags & RTF_GATEWAY)
 438                         target = &rt->rt6i_gateway;
 439                 else
 440                         target = &hdr->daddr;
 441
 442                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 443
 444                 /* Limit redirects both by destination (here)
 445                    and by source (inside ndisc_send_redirect)
 446                  */
 447                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 448                         ndisc_send_redirect(skb, target);
 449                 if (peer)
 450                         inet_putpeer(peer);
 451         } else {
 452                 int addrtype = ipv6_addr_type(&hdr->saddr);
 453
 454                 /* This check is security critical. */
 455                 if (addrtype == IPV6_ADDR_ANY ||
 456                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 457                         goto error;
 458                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 459                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 460                                     ICMPV6_NOT_NEIGHBOUR, 0);
 461                         goto error;
 462                 }
 463         }
 464
 465         mtu = dst_mtu(dst);
 466         if (mtu < IPV6_MIN_MTU)
 467                 mtu = IPV6_MIN_MTU;
 468
 469         if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
 470             (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
 471                 /* Again, force OUTPUT device used as source address */
 472                 skb->dev = dst->dev;
 473                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 474                 IP6_INC_STATS_BH(net,
 475                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 476                 IP6_INC_STATS_BH(net,
 477                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 478                 kfree_skb(skb);
 479                 return -EMSGSIZE;
 480         }
 481
 482         if (skb_cow(skb, dst->dev->hard_header_len)) {
 483                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 484                 goto drop;
 485         }
 486
 487         hdr = ipv6_hdr(skb);
 488
 489         /* Mangling hops number delayed to point after skb COW */
 490
 491         hdr->hop_limit--;
 492
 493         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 494         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 495         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 496                        ip6_forward_finish);
 497
 498 error:
 499         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 500 drop:
 501         kfree_skb(skb);
 502         return -EINVAL;
 503 }
 504
 505 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 506 {
 507         to->pkt_type = from->pkt_type;
 508         to->priority = from->priority;
 509         to->protocol = from->protocol;
 510         skb_dst_drop(to);
 511         skb_dst_set(to, dst_clone(skb_dst(from)));
 512         to->dev = from->dev;
 513         to->mark = from->mark;
 514
 515 #ifdef CONFIG_NET_SCHED
 516         to->tc_index = from->tc_index;
 517 #endif
 518         nf_copy(to, from);
 519 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 520         to->nf_trace = from->nf_trace;
 521 #endif
 522         skb_copy_secmark(to, from);
 523 }
 524
 525 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 526 {
 527         struct sk_buff *frag;
 528         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 529         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 530         struct ipv6hdr *tmp_hdr;
 531         struct frag_hdr *fh;
 532         unsigned int mtu, hlen, left, len;
 533         int hroom, troom;
 534         __be32 frag_id = 0;
 535         int ptr, offset = 0, err=0;
 536         u8 *prevhdr, nexthdr = 0;
 537         struct net *net = dev_net(skb_dst(skb)->dev);
 538
 539         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 540         nexthdr = *prevhdr;
 541
 542         mtu = ip6_skb_dst_mtu(skb);
 543
 544         /* We must not fragment if the socket is set to force MTU discovery
 545          * or if the skb it not generated by a local socket.
 546          */
 547         if (unlikely(!skb->local_df && skb->len > mtu) ||
 548                      (IP6CB(skb)->frag_max_size &&
 549                       IP6CB(skb)->frag_max_size > mtu)) {
 550                 if (skb->sk && dst_allfrag(skb_dst(skb)))
 551                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 552
 553                 skb->dev = skb_dst(skb)->dev;
 554                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 555                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 556                               IPSTATS_MIB_FRAGFAILS);
 557                 kfree_skb(skb);
 558                 return -EMSGSIZE;
 559         }
 560
 561         if (np && np->frag_size < mtu) {
 562                 if (np->frag_size)
 563                         mtu = np->frag_size;
 564         }
 565         mtu -= hlen + sizeof(struct frag_hdr);
 566
 567         if (skb_has_frag_list(skb)) {
 568                 int first_len = skb_pagelen(skb);
 569                 struct sk_buff *frag2;
 570
 571                 if (first_len - hlen > mtu ||
 572                     ((first_len - hlen) & 7) ||
 573                     skb_cloned(skb))
 574                         goto slow_path;
 575
 576                 skb_walk_frags(skb, frag) {
 577                         /* Correct geometry. */
 578                         if (frag->len > mtu ||
 579                             ((frag->len & 7) && frag->next) ||
 580                             skb_headroom(frag) < hlen)
 581                                 goto slow_path_clean;
 582
 583                         /* Partially cloned skb? */
 584                         if (skb_shared(frag))
 585                                 goto slow_path_clean;
 586
 587                         BUG_ON(frag->sk);
 588                         if (skb->sk) {
 589                                 frag->sk = skb->sk;
 590                                 frag->destructor = sock_wfree;
 591                         }
 592                         skb->truesize -= frag->truesize;
 593                 }
 594
 595                 err = 0;
 596                 offset = 0;
 597                 frag = skb_shinfo(skb)->frag_list;
 598                 skb_frag_list_init(skb);
 599                 /* BUILD HEADER */
 600
 601                 *prevhdr = NEXTHDR_FRAGMENT;
 602                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 603                 if (!tmp_hdr) {
 604                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 605                                       IPSTATS_MIB_FRAGFAILS);
 606                         return -ENOMEM;
 607                 }
 608
 609                 __skb_pull(skb, hlen);
 610                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 611                 __skb_push(skb, hlen);
 612                 skb_reset_network_header(skb);
 613                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 614
 615                 ipv6_select_ident(fh, rt);
 616                 fh->nexthdr = nexthdr;
 617                 fh->reserved = 0;
 618                 fh->frag_off = htons(IP6_MF);
 619                 frag_id = fh->identification;
 620
 621                 first_len = skb_pagelen(skb);
 622                 skb->data_len = first_len - skb_headlen(skb);
 623                 skb->len = first_len;
 624                 ipv6_hdr(skb)->payload_len = htons(first_len -
 625                                                    sizeof(struct ipv6hdr));
 626
 627                 dst_hold(&rt->dst);
 628
 629                 for (;;) {
 630                         /* Prepare header of the next frame,
 631                          * before previous one went down. */
 632                         if (frag) {
 633                                 frag->ip_summed = CHECKSUM_NONE;
 634                                 skb_reset_transport_header(frag);
 635                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 636                                 __skb_push(frag, hlen);
 637                                 skb_reset_network_header(frag);
 638                                 memcpy(skb_network_header(frag), tmp_hdr,
 639                                        hlen);
 640                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 641                                 fh->nexthdr = nexthdr;
 642                                 fh->reserved = 0;
 643                                 fh->frag_off = htons(offset);
 644                                 if (frag->next != NULL)
 645                                         fh->frag_off |= htons(IP6_MF);
 646                                 fh->identification = frag_id;
 647                                 ipv6_hdr(frag)->payload_len =
 648                                                 htons(frag->len -
 649                                                       sizeof(struct ipv6hdr));
 650                                 ip6_copy_metadata(frag, skb);
 651                         }
 652
 653                         err = output(skb);
 654                         if(!err)
 655                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 656                                               IPSTATS_MIB_FRAGCREATES);
 657
 658                         if (err || !frag)
 659                                 break;
 660
 661                         skb = frag;
 662                         frag = skb->next;
 663                         skb->next = NULL;
 664                 }
 665
 666                 kfree(tmp_hdr);
 667
 668                 if (err == 0) {
 669                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 670                                       IPSTATS_MIB_FRAGOKS);
 671                         ip6_rt_put(rt);
 672                         return 0;
 673                 }
 674
 675                 while (frag) {
 676                         skb = frag->next;
 677                         kfree_skb(frag);
 678                         frag = skb;
 679                 }
 680
 681                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 682                               IPSTATS_MIB_FRAGFAILS);
 683                 ip6_rt_put(rt);
 684                 return err;
 685
 686 slow_path_clean:
 687                 skb_walk_frags(skb, frag2) {
 688                         if (frag2 == frag)
 689                                 break;
 690                         frag2->sk = NULL;
 691                         frag2->destructor = NULL;
 692                         skb->truesize += frag2->truesize;
 693                 }
 694         }
 695
 696 slow_path:
 697         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 698             skb_checksum_help(skb))
 699                 goto fail;
 700
 701         left = skb->len - hlen;         /* Space per frame */
 702         ptr = hlen;                     /* Where to start from */
 703
 704         /*
 705          *      Fragment the datagram.
 706          */
 707
 708         *prevhdr = NEXTHDR_FRAGMENT;
 709         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 710         troom = rt->dst.dev->needed_tailroom;
 711
 712         /*
 713          *      Keep copying data until we run out.
 714          */
 715         while(left > 0) {
 716                 len = left;
 717                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 718                 if (len > mtu)
 719                         len = mtu;
 720                 /* IF: we are not sending up to and including the packet end
 721                    then align the next start on an eight byte boundary */
 722                 if (len < left) {
 723                         len &= ~7;
 724                 }
 725                 /*
 726                  *      Allocate buffer.
 727                  */
 728
 729                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 730                                       hroom + troom, GFP_ATOMIC)) == NULL) {
 731                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 732                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 733                                       IPSTATS_MIB_FRAGFAILS);
 734                         err = -ENOMEM;
 735                         goto fail;
 736                 }
 737
 738                 /*
 739                  *      Set up data on packet
 740                  */
 741
 742                 ip6_copy_metadata(frag, skb);
 743                 skb_reserve(frag, hroom);
 744                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 745                 skb_reset_network_header(frag);
 746                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 747                 frag->transport_header = (frag->network_header + hlen +
 748                                           sizeof(struct frag_hdr));
 749
 750                 /*
 751                  *      Charge the memory for the fragment to any owner
 752                  *      it might possess
 753                  */
 754                 if (skb->sk)
 755                         skb_set_owner_w(frag, skb->sk);
 756
 757                 /*
 758                  *      Copy the packet header into the new buffer.
 759                  */
 760                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 761
 762                 /*
 763                  *      Build fragment header.
 764                  */
 765                 fh->nexthdr = nexthdr;
 766                 fh->reserved = 0;
 767                 if (!frag_id) {
 768                         ipv6_select_ident(fh, rt);
 769                         frag_id = fh->identification;
 770                 } else
 771                         fh->identification = frag_id;
 772
 773                 /*
 774                  *      Copy a block of the IP datagram.
 775                  */
 776                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 777                         BUG();
 778                 left -= len;
 779
 780                 fh->frag_off = htons(offset);
 781                 if (left > 0)
 782                         fh->frag_off |= htons(IP6_MF);
 783                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 784                                                     sizeof(struct ipv6hdr));
 785
 786                 ptr += len;
 787                 offset += len;
 788
 789                 /*
 790                  *      Put this fragment into the sending queue.
 791                  */
 792                 err = output(frag);
 793                 if (err)
 794                         goto fail;
 795
 796                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 797                               IPSTATS_MIB_FRAGCREATES);
 798         }
 799         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 800                       IPSTATS_MIB_FRAGOKS);
 801         consume_skb(skb);
 802         return err;
 803
 804 fail:
 805         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 806                       IPSTATS_MIB_FRAGFAILS);
 807         kfree_skb(skb);
 808         return err;
 809 }
 810
 811 static inline int ip6_rt_check(const struct rt6key *rt_key,
 812                                const struct in6_addr *fl_addr,
 813                                const struct in6_addr *addr_cache)
 814 {
 815         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 816                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 817 }
 818
 819 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 820                                           struct dst_entry *dst,
 821                                           const struct flowi6 *fl6)
 822 {
 823         struct ipv6_pinfo *np = inet6_sk(sk);
 824         struct rt6_info *rt;
 825
 826         if (!dst)
 827                 goto out;
 828
 829         if (dst->ops->family != AF_INET6) {
 830                 dst_release(dst);
 831                 return NULL;
 832         }
 833
 834         rt = (struct rt6_info *)dst;
 835         /* Yes, checking route validity in not connected
 836          * case is not very simple. Take into account,
 837          * that we do not support routing by source, TOS,
 838          * and MSG_DONTROUTE            --ANK (980726)
 839          *
 840          * 1. ip6_rt_check(): If route was host route,
 841          *    check that cached destination is current.
 842          *    If it is network route, we still may
 843          *    check its validity using saved pointer
 844          *    to the last used address: daddr_cache.
 845          *    We do not want to save whole address now,
 846          *    (because main consumer of this service
 847          *    is tcp, which has not this problem),
 848          *    so that the last trick works only on connected
 849          *    sockets.
 850          * 2. oif also should be the same.
 851          */
 852         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 853 #ifdef CONFIG_IPV6_SUBTREES
 854             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 855 #endif
 856             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 857                 dst_release(dst);
 858                 dst = NULL;
 859         }
 860
 861 out:
 862         return dst;
 863 }
 864
 865 static int ip6_dst_lookup_tail(struct sock *sk,
 866                                struct dst_entry **dst, struct flowi6 *fl6)
 867 {
 868         struct net *net = sock_net(sk);
 869 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 870         struct neighbour *n;
 871         struct rt6_info *rt;
 872 #endif
 873         int err;
 874
 875         if (*dst == NULL)
 876                 *dst = ip6_route_output(net, sk, fl6);
 877
 878         if ((err = (*dst)->error))
 879                 goto out_err_release;
 880
 881         if (ipv6_addr_any(&fl6->saddr)) {
 882                 struct rt6_info *rt = (struct rt6_info *) *dst;
 883                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 884                                           sk ? inet6_sk(sk)->srcprefs : 0,
 885                                           &fl6->saddr);
 886                 if (err)
 887                         goto out_err_release;
 888         }
 889
 890 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 891         /*
 892          * Here if the dst entry we've looked up
 893          * has a neighbour entry that is in the INCOMPLETE
 894          * state and the src address from the flow is
 895          * marked as OPTIMISTIC, we release the found
 896          * dst entry and replace it instead with the
 897          * dst entry of the nexthop router
 898          */
 899         rt = (struct rt6_info *) *dst;
 900         rcu_read_lock_bh();
 901         n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
 902         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 903         rcu_read_unlock_bh();
 904
 905         if (err) {
 906                 struct inet6_ifaddr *ifp;
 907                 struct flowi6 fl_gw6;
 908                 int redirect;
 909
 910                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 911                                       (*dst)->dev, 1);
 912
 913                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 914                 if (ifp)
 915                         in6_ifa_put(ifp);
 916
 917                 if (redirect) {
 918                         /*
 919                          * We need to get the dst entry for the
 920                          * default router instead
 921                          */
 922                         dst_release(*dst);
 923                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 924                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 925                         *dst = ip6_route_output(net, sk, &fl_gw6);
 926                         if ((err = (*dst)->error))
 927                                 goto out_err_release;
 928                 }
 929         }
 930 #endif
 931
 932         return 0;
 933
 934 out_err_release:
 935         if (err == -ENETUNREACH)
 936                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 937         dst_release(*dst);
 938         *dst = NULL;
 939         return err;
 940 }
 941
 942 /**
 943  *      ip6_dst_lookup - perform route lookup on flow
 944  *      @sk: socket which provides route info
 945  *      @dst: pointer to dst_entry * for result
 946  *      @fl6: flow to lookup
 947  *
 948  *      This function performs a route lookup on the given flow.
 949  *
 950  *      It returns zero on success, or a standard errno code on error.
 951  */
 952 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
 953 {
 954         *dst = NULL;
 955         return ip6_dst_lookup_tail(sk, dst, fl6);
 956 }
 957 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 958
 959 /**
 960  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 961  *      @sk: socket which provides route info
 962  *      @fl6: flow to lookup
 963  *      @final_dst: final destination address for ipsec lookup
 964  *      @can_sleep: we are in a sleepable context
 965  *
 966  *      This function performs a route lookup on the given flow.
 967  *
 968  *      It returns a valid dst pointer on success, or a pointer encoded
 969  *      error code.
 970  */
 971 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 972                                       const struct in6_addr *final_dst,
 973                                       bool can_sleep)
 974 {
 975         struct dst_entry *dst = NULL;
 976         int err;
 977
 978         err = ip6_dst_lookup_tail(sk, &dst, fl6);
 979         if (err)
 980                 return ERR_PTR(err);
 981         if (final_dst)
 982                 fl6->daddr = *final_dst;
 983         if (can_sleep)
 984                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 985
 986         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 987 }
 988 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
 989
 990 /**
 991  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 992  *      @sk: socket which provides the dst cache and route info
 993  *      @fl6: flow to lookup
 994  *      @final_dst: final destination address for ipsec lookup
 995  *      @can_sleep: we are in a sleepable context
 996  *
 997  *      This function performs a route lookup on the given flow with the
 998  *      possibility of using the cached route in the socket if it is valid.
 999  *      It will take the socket dst lock when operating on the dst cache.
1000  *      As a result, this function can only be used in process context.
1001  *
1002  *      It returns a valid dst pointer on success, or a pointer encoded
1003  *      error code.
1004  */
1005 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1006                                          const struct in6_addr *final_dst,
1007                                          bool can_sleep)
1008 {
1009         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1010         int err;
1011
1012         dst = ip6_sk_dst_check(sk, dst, fl6);
1013
1014         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1015         if (err)
1016                 return ERR_PTR(err);
1017         if (final_dst)
1018                 fl6->daddr = *final_dst;
1019         if (can_sleep)
1020                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1021
1022         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1023 }
1024 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1025
1026 static inline int ip6_ufo_append_data(struct sock *sk,
1027                         int getfrag(void *from, char *to, int offset, int len,
1028                         int odd, struct sk_buff *skb),
1029                         void *from, int length, int hh_len, int fragheaderlen,
1030                         int transhdrlen, int mtu,unsigned int flags,
1031                         struct rt6_info *rt)
1032
1033 {
1034         struct sk_buff *skb;
1035         int err;
1036
1037         /* There is support for UDP large send offload by network
1038          * device, so create one single skb packet containing complete
1039          * udp datagram
1040          */
1041         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1042                 skb = sock_alloc_send_skb(sk,
1043                         hh_len + fragheaderlen + transhdrlen + 20,
1044                         (flags & MSG_DONTWAIT), &err);
1045                 if (skb == NULL)
1046                         return err;
1047
1048                 /* reserve space for Hardware header */
1049                 skb_reserve(skb, hh_len);
1050
1051                 /* create space for UDP/IP header */
1052                 skb_put(skb,fragheaderlen + transhdrlen);
1053
1054                 /* initialize network header pointer */
1055                 skb_reset_network_header(skb);
1056
1057                 /* initialize protocol header pointer */
1058                 skb->transport_header = skb->network_header + fragheaderlen;
1059
1060                 skb->ip_summed = CHECKSUM_PARTIAL;
1061                 skb->csum = 0;
1062         }
1063
1064         err = skb_append_datato_frags(sk,skb, getfrag, from,
1065                                       (length - transhdrlen));
1066         if (!err) {
1067                 struct frag_hdr fhdr;
1068
1069                 /* Specify the length of each IPv6 datagram fragment.
1070                  * It has to be a multiple of 8.
1071                  */
1072                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1073                                              sizeof(struct frag_hdr)) & ~7;
1074                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1075                 ipv6_select_ident(&fhdr, rt);
1076                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1077                 __skb_queue_tail(&sk->sk_write_queue, skb);
1078
1079                 return 0;
1080         }
1081         /* There is not enough support do UPD LSO,
1082          * so follow normal path
1083          */
1084         kfree_skb(skb);
1085
1086         return err;
1087 }
1088
1089 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1090                                                gfp_t gfp)
1091 {
1092         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1093 }
1094
1095 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1096                                                 gfp_t gfp)
1097 {
1098         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1099 }
1100
1101 static void ip6_append_data_mtu(unsigned int *mtu,
1102                                 int *maxfraglen,
1103                                 unsigned int fragheaderlen,
1104                                 struct sk_buff *skb,
1105                                 struct rt6_info *rt,
1106                                 bool pmtuprobe)
1107 {
1108         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1109                 if (skb == NULL) {
1110                         /* first fragment, reserve header_len */
1111                         *mtu = *mtu - rt->dst.header_len;
1112
1113                 } else {
1114                         /*
1115                          * this fragment is not first, the headers
1116                          * space is regarded as data space.
1117                          */
1118                         *mtu = min(*mtu, pmtuprobe ?
1119                                    rt->dst.dev->mtu :
1120                                    dst_mtu(rt->dst.path));
1121                 }
1122                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1123                               + fragheaderlen - sizeof(struct frag_hdr);
1124         }
1125 }
1126
/**
 *      ip6_append_data - queue data on the socket's pending write queue
 *      @sk: socket whose sk_write_queue and cork state are used
 *      @getfrag: caller-supplied copy routine, invoked as
 *                getfrag(from, to, offset, len, odd, skb); a negative
 *                return makes this function fail with -EFAULT
 *      @from: opaque cookie handed to @getfrag
 *      @length: number of bytes to append
 *      @transhdrlen: transport header length; forced to 0 when data is
 *                    already queued
 *      @hlimit: hop limit, saved in the cork when the queue is empty
 *      @tclass: traffic class, saved in the cork when the queue is empty
 *      @opt: tx options, duplicated into the cork when the queue is empty
 *      @fl6: flow, copied into the cork when the queue is empty
 *      @rt: route, held and stored in the cork when the queue is empty
 *      @flags: MSG_* flags; MSG_PROBE, MSG_MORE and MSG_DONTWAIT are examined
 *      @dontfrag: when set, UDP/RAW data longer than the mtu is refused
 *
 *      When sk_write_queue is empty the cork is set up from the arguments.
 *      When it is not, @rt, @fl6 and @opt are replaced by the values stored
 *      in the cork and @hlimit/@tclass are not used.  Data is then appended
 *      to the tail skb, and new skbs are allocated as needed, each sized
 *      from mtu/maxfraglen.
 *
 *      Returns 0 on success (and immediately when MSG_PROBE is set), or a
 *      negative errno: -EINVAL, -ENOBUFS, -EMSGSIZE, -EFAULT, -ENOMEM, or
 *      the error reported by sock_alloc_send_skb()/ip6_ufo_append_data().
 */
1127 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1128         int offset, int len, int odd, struct sk_buff *skb),
1129         void *from, int length, int transhdrlen,
1130         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1131         struct rt6_info *rt, unsigned int flags, int dontfrag)
1132 {
1133         struct inet_sock *inet = inet_sk(sk);
1134         struct ipv6_pinfo *np = inet6_sk(sk);
1135         struct inet_cork *cork;
1136         struct sk_buff *skb, *skb_prev = NULL;
1137         unsigned int maxfraglen, fragheaderlen, mtu;
1138         int exthdrlen;
1139         int dst_exthdrlen;
1140         int hh_len;
1141         int copy;
1142         int err;
1143         int offset = 0;
1144         __u8 tx_flags = 0;
1145
             /* MSG_PROBE: report success without queueing anything. */
1146         if (flags&MSG_PROBE)
1147                 return 0;
1148         cork = &inet->cork.base;
1149         if (skb_queue_empty(&sk->sk_write_queue)) {
1150                 /*
1151                  * setup for corking
1152                  */
1153                 if (opt) {
1154                         if (WARN_ON(np->cork.opt))
1155                                 return -EINVAL;
1156
                             /*
                              * Deep-copy the options: the container first,
                              * then each extension header it points to.
                              * NOTE(review): the -ENOBUFS returns below leave
                              * np->cork.opt partly filled; presumably the
                              * caller releases it through ip6_cork_release()
                              * -- confirm.
                              */
1157                         np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1158                         if (unlikely(np->cork.opt == NULL))
1159                                 return -ENOBUFS;
1160
1161                         np->cork.opt->tot_len = opt->tot_len;
1162                         np->cork.opt->opt_flen = opt->opt_flen;
1163                         np->cork.opt->opt_nflen = opt->opt_nflen;
1164
1165                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1166                                                             sk->sk_allocation);
1167                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1168                                 return -ENOBUFS;
1169
1170                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1171                                                             sk->sk_allocation);
1172                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1173                                 return -ENOBUFS;
1174
1175                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1176                                                            sk->sk_allocation);
1177                         if (opt->hopopt && !np->cork.opt->hopopt)
1178                                 return -ENOBUFS;
1179
1180                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1181                                                             sk->sk_allocation);
1182                         if (opt->srcrt && !np->cork.opt->srcrt)
1183                                 return -ENOBUFS;
1184
1185                         /* need source address above miyazawa*/
1186                 }
                     /*
                      * Take a reference on the route and record the
                      * per-datagram parameters in the cork for later calls.
                      */
1187                 dst_hold(&rt->dst);
1188                 cork->dst = &rt->dst;
1189                 inet->cork.fl.u.ip6 = *fl6;
1190                 np->cork.hop_limit = hlimit;
1191                 np->cork.tclass = tclass;
                     /*
                      * With IPV6_PMTUDISC_PROBE the device mtu is used;
                      * otherwise the dst mtu, taken from the route itself
                      * for DST_XFRM_TUNNEL routes and from its path dst for
                      * all others.
                      */
1192                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1193                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1194                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1195                 else
1196                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1197                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                     /* A non-zero np->frag_size below the mtu caps it. */
1198                 if (np->frag_size < mtu) {
1199                         if (np->frag_size)
1200                                 mtu = np->frag_size;
1201                 }
1202                 cork->fragsize = mtu;
1203                 if (dst_allfrag(rt->dst.path))
1204                         cork->flags |= IPCORK_ALLFRAG;
1205                 cork->length = 0;
                     /*
                      * The fragmentable extension headers (opt_flen) are
                      * accounted as part of the data and of the transport
                      * header of this first call.
                      */
1206                 exthdrlen = (opt ? opt->opt_flen : 0);
1207                 length += exthdrlen;
1208                 transhdrlen += exthdrlen;
1209                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1210         } else {
                     /*
                      * Data is already pending: ignore the caller's rt, fl6
                      * and opt and continue with what the cork recorded.
                      */
1211                 rt = (struct rt6_info *)cork->dst;
1212                 fl6 = &inet->cork.fl.u.ip6;
1213                 opt = np->cork.opt;
1214                 transhdrlen = 0;
1215                 exthdrlen = 0;
1216                 dst_exthdrlen = 0;
1217                 mtu = cork->fragsize;
1218         }
1219
1220         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1221
             /*
              * fragheaderlen: IPv6 header + rt6i_nfheader_len + the
              * non-fragmentable options (opt_nflen).
              * maxfraglen: the part of the mtu beyond fragheaderlen rounded
              * down to a multiple of 8, plus fragheaderlen, minus the size
              * of a fragment header.
              */
1222         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1223                         (opt ? opt->opt_nflen : 0);
1224         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1225
             /*
              * Refuse data that would push the corked total beyond
              * sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen.
              */
1226         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1227                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1228                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1229                         return -EMSGSIZE;
1230                 }
1231         }
1232
1233         /* For UDP, check if TX timestamp is enabled */
1234         if (sk->sk_type == SOCK_DGRAM)
1235                 sock_tx_timestamp(sk, &tx_flags);
1236
1237         /*
1238          * Let's try using as much space as possible.
1239          * Use MTU if total length of the message fits into the MTU.
1240          * Otherwise, we need to reserve fragment header and
1241          * fragment alignment (= 8-15 octets, in total).
1242          *
1243          * Note that we may need to "move" the data from the tail
1244          * of the buffer to the new fragment when we split
1245          * the message.
1246          *
1247          * FIXME: It may be fragmented into multiple chunks
1248          *        at once if non-fragmentable extension headers
1249          *        are too large.
1250          * --yoshfuji
1251          */
1252
             /*
              * Account for the new data up front; the error label below
              * takes back whatever part of it was not queued.
              */
1253         cork->length += length;
1254         if (length > mtu) {
1255                 int proto = sk->sk_protocol;
1256                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
                             /*
                              * NOTE(review): this return bypasses the error
                              * label, so cork->length keeps the length added
                              * above -- confirm this is intended.
                              */
1257                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1258                         return -EMSGSIZE;
1259                 }
1260
                     /* UDP on a device advertising NETIF_F_UFO: build one large skb. */
1261                 if (proto == IPPROTO_UDP &&
1262                     (rt->dst.dev->features & NETIF_F_UFO)) {
1263
1264                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1265                                                   hh_len, fragheaderlen,
1266                                                   transhdrlen, mtu, flags, rt);
1267                         if (err)
1268                                 goto error;
1269                         return 0;
1270                 }
1271         }
1272
             /* Empty queue: jump into the loop body to allocate the first skb. */
1273         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1274                 goto alloc_new_skb;
1275
1276         while (length > 0) {
1277                 /* Check if the remaining data fits into current packet. */
                     /*
                      * Room in the tail skb is measured against the mtu when
                      * the whole corked datagram fits in it and
                      * IPCORK_ALLFRAG is clear, otherwise against
                      * maxfraglen.  If the remaining data does not fit in
                      * that room, maxfraglen is used.
                      */
1278                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1279                 if (copy < length)
1280                         copy = maxfraglen - skb->len;
1281
1282                 if (copy <= 0) {
1283                         char *data;
1284                         unsigned int datalen;
1285                         unsigned int fraglen;
1286                         unsigned int fraggap;
1287                         unsigned int alloclen;
1288 alloc_new_skb:
1289                         /* There's no room in the current skb */
                             /*
                              * fraggap: bytes of the current skb beyond
                              * maxfraglen; they are copied into the new skb
                              * and trimmed off the old one further down.
                              */
1290                         if (skb)
1291                                 fraggap = skb->len - maxfraglen;
1292                         else
1293                                 fraggap = 0;
1294                         /* update mtu and maxfraglen if necessary */
1295                         if (skb == NULL || skb_prev == NULL)
1296                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1297                                                     fragheaderlen, skb, rt,
1298                                                     np->pmtudisc ==
1299                                                     IPV6_PMTUDISC_PROBE);
1300
1301                         skb_prev = skb;
1302
1303                         /*
1304                          * If remaining data exceeds the mtu,
1305                          * we know we need more fragment(s).
1306                          */
1307                         datalen = length + fraggap;
1308
1309                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1310                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                             /*
                              * With MSG_MORE on a device without NETIF_F_SG
                              * a full mtu is allocated; otherwise only what
                              * this skb needs.
                              */
1311                         if ((flags & MSG_MORE) &&
1312                             !(rt->dst.dev->features&NETIF_F_SG))
1313                                 alloclen = mtu;
1314                         else
1315                                 alloclen = datalen + fragheaderlen;
1316
1317                         alloclen += dst_exthdrlen;
1318
1319                         if (datalen != length + fraggap) {
1320                                 /*
1321                                  * this is not the last fragment, the trailer
1322                                  * space is regarded as data space.
1323                                  */
1324                                 datalen += rt->dst.trailer_len;
1325                         }
1326
1327                         alloclen += rt->dst.trailer_len;
1328                         fraglen = datalen + fragheaderlen;
1329
1330                         /*
1331                          * We just reserve space for fragment header.
1332                          * Note: this may be overallocation if the message
1333                          * (without MSG_MORE) fits into the MTU.
1334                          */
1335                         alloclen += sizeof(struct frag_hdr);
1336
                             /*
                              * A non-zero transhdrlen marks the first skb of
                              * the datagram, which is allocated with
                              * sock_alloc_send_skb().  Later skbs use
                              * sock_wmalloc(), and only while sk_wmem_alloc
                              * does not exceed twice sk_sndbuf.
                              */
1337                         if (transhdrlen) {
1338                                 skb = sock_alloc_send_skb(sk,
1339                                                 alloclen + hh_len,
1340                                                 (flags & MSG_DONTWAIT), &err);
1341                         } else {
1342                                 skb = NULL;
1343                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1344                                     2 * sk->sk_sndbuf)
1345                                         skb = sock_wmalloc(sk,
1346                                                            alloclen + hh_len, 1,
1347                                                            sk->sk_allocation);
1348                                 if (unlikely(skb == NULL))
1349                                         err = -ENOBUFS;
1350                                 else {
1351                                         /* Only the initial fragment
1352                                          * is time stamped.
1353                                          */
1354                                         tx_flags = 0;
1355                                 }
1356                         }
1357                         if (skb == NULL)
1358                                 goto error;
1359                         /*
1360                          *      Fill in the control structures
1361                          */
1362                         skb->ip_summed = CHECKSUM_NONE;
1363                         skb->csum = 0;
1364                         /* reserve for fragmentation and ipsec header */
1365                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1366                                     dst_exthdrlen);
1367
1368                         if (sk->sk_type == SOCK_DGRAM)
1369                                 skb_shinfo(skb)->tx_flags = tx_flags;
1370
1371                         /*
1372                          *      Find where to start putting bytes
1373                          */
1374                         data = skb_put(skb, fraglen);
1375                         skb_set_network_header(skb, exthdrlen);
1376                         data += fragheaderlen;
1377                         skb->transport_header = (skb->network_header +
1378                                                  fragheaderlen);
                             /*
                              * Move the overhanging fraggap bytes from the
                              * previous skb into this one, take their
                              * checksum out of the previous skb's csum and
                              * trim the previous skb back to maxfraglen.
                              */
1379                         if (fraggap) {
1380                                 skb->csum = skb_copy_and_csum_bits(
1381                                         skb_prev, maxfraglen,
1382                                         data + transhdrlen, fraggap, 0);
1383                                 skb_prev->csum = csum_sub(skb_prev->csum,
1384                                                           skb->csum);
1385                                 data += fraggap;
1386                                 pskb_trim_unique(skb_prev, maxfraglen);
1387                         }
                             /* Bytes to fetch from the caller for this skb. */
1388                         copy = datalen - transhdrlen - fraggap;
1389
1390                         if (copy < 0) {
1391                                 err = -EINVAL;
1392                                 kfree_skb(skb);
1393                                 goto error;
1394                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1395                                 err = -EFAULT;
1396                                 kfree_skb(skb);
1397                                 goto error;
1398                         }
1399
                             /*
                              * The header lengths apply to the first skb
                              * only; clear them for any further skb.
                              */
1400                         offset += copy;
1401                         length -= datalen - fraggap;
1402                         transhdrlen = 0;
1403                         exthdrlen = 0;
1404                         dst_exthdrlen = 0;
1405
1406                         /*
1407                          * Put the packet on the pending queue
1408                          */
1409                         __skb_queue_tail(&sk->sk_write_queue, skb);
1410                         continue;
1411                 }
1412
1413                 if (copy > length)
1414                         copy = length;
1415
1416                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1417                         unsigned int off;
1418
                             /*
                              * No scatter-gather: copy into the linear area
                              * and undo the skb_put() if the copy fails.
                              */
1419                         off = skb->len;
1420                         if (getfrag(from, skb_put(skb, copy),
1421                                                 offset, copy, off, skb) < 0) {
1422                                 __skb_trim(skb, off);
1423                                 err = -EFAULT;
1424                                 goto error;
1425                         }
1426                 } else {
                             /*
                              * Scatter-gather: copy into the page fragment
                              * returned by sk_page_frag() and attach it to
                              * the skb as a paged fragment.
                              */
1427                         int i = skb_shinfo(skb)->nr_frags;
1428                         struct page_frag *pfrag = sk_page_frag(sk);
1429
1430                         err = -ENOMEM;
1431                         if (!sk_page_frag_refill(sk, pfrag))
1432                                 goto error;
1433
                             /*
                              * Start a new fragment slot unless the data can
                              * be coalesced with the last one; give up with
                              * -EMSGSIZE when all MAX_SKB_FRAGS slots are
                              * in use.
                              */
1434                         if (!skb_can_coalesce(skb, i, pfrag->page,
1435                                               pfrag->offset)) {
1436                                 err = -EMSGSIZE;
1437                                 if (i == MAX_SKB_FRAGS)
1438                                         goto error;
1439
1440                                 __skb_fill_page_desc(skb, i, pfrag->page,
1441                                                      pfrag->offset, 0);
1442                                 skb_shinfo(skb)->nr_frags = ++i;
1443                                 get_page(pfrag->page);
1444                         }
1445                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1446                         if (getfrag(from,
1447                                     page_address(pfrag->page) + pfrag->offset,
1448                                     offset, copy, skb->len, skb) < 0)
1449                                 goto error_efault;
1450
                             /*
                              * Grow the fragment, the skb length fields and
                              * the socket's sk_wmem_alloc by the bytes just
                              * copied.
                              */
1451                         pfrag->offset += copy;
1452                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1453                         skb->len += copy;
1454                         skb->data_len += copy;
1455                         skb->truesize += copy;
1456                         atomic_add(copy, &sk->sk_wmem_alloc);
1457                 }
1458                 offset += copy;
1459                 length -= copy;
1460         }
1461
1462         return 0;
1463
1464 error_efault:
1465         err = -EFAULT;
1466 error:
             /*
              * length now holds only the bytes that were not queued; take
              * them back out of the cork total and count the discard.
              */
1467         cork->length -= length;
1468         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1469         return err;
1470 }
1471 EXPORT_SYMBOL_GPL(ip6_append_data);
1472
1473 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1474 {
1475         if (np->cork.opt) {
1476                 kfree(np->cork.opt->dst0opt);
1477                 kfree(np->cork.opt->dst1opt);
1478                 kfree(np->cork.opt->hopopt);
1479                 kfree(np->cork.opt->srcrt);
1480                 kfree(np->cork.opt);
1481                 np->cork.opt = NULL;
1482         }
1483
1484         if (inet->cork.base.dst) {
1485                 dst_release(inet->cork.base.dst);
1486                 inet->cork.base.dst = NULL;
1487                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1488         }
1489         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1490 }
1491
1492 int ip6_push_pending_frames(struct sock *sk)
1493 {
1494         struct sk_buff *skb, *tmp_skb;
1495         struct sk_buff **tail_skb;
1496         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1497         struct inet_sock *inet = inet_sk(sk);
1498         struct ipv6_pinfo *np = inet6_sk(sk);
1499         struct net *net = sock_net(sk);
1500         struct ipv6hdr *hdr;
1501         struct ipv6_txoptions *opt = np->cork.opt;
1502         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1503         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1504         unsigned char proto = fl6->flowi6_proto;
1505         int err = 0;
1506
1507         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1508                 goto out;
1509         tail_skb = &(skb_shinfo(skb)->frag_list);
1510
1511         /* move skb->data to ip header from ext header */
1512         if (skb->data < skb_network_header(skb))
1513                 __skb_pull(skb, skb_network_offset(skb));
1514         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1515                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1516                 *tail_skb = tmp_skb;
1517                 tail_skb = &(tmp_skb->next);
1518                 skb->len += tmp_skb->len;
1519                 skb->data_len += tmp_skb->len;
1520                 skb->truesize += tmp_skb->truesize;
1521                 tmp_skb->destructor = NULL;
1522                 tmp_skb->sk = NULL;
1523         }
1524
1525         /* Allow local fragmentation. */
1526         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1527                 skb->local_df = 1;
1528
1529         *final_dst = fl6->daddr;
1530         __skb_pull(skb, skb_network_header_len(skb));
1531         if (opt && opt->opt_flen)
1532                 ipv6_push_frag_opts(skb, opt, &proto);
1533         if (opt && opt->opt_nflen)
1534                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1535
1536         skb_push(skb, sizeof(struct ipv6hdr));
1537         skb_reset_network_header(skb);
1538         hdr = ipv6_hdr(skb);
1539
1540         ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1541         hdr->hop_limit = np->cork.hop_limit;
1542         hdr->nexthdr = proto;
1543         hdr->saddr = fl6->saddr;
1544         hdr->daddr = *final_dst;
1545
1546         skb->priority = sk->sk_priority;
1547         skb->mark = sk->sk_mark;
1548
1549         skb_dst_set(skb, dst_clone(&rt->dst));
1550         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1551         if (proto == IPPROTO_ICMPV6) {
1552                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1553
1554                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1555                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1556         }
1557
1558         err = ip6_local_out(skb);
1559         if (err) {
1560                 if (err > 0)
1561                         err = net_xmit_errno(err);
1562                 if (err)
1563                         goto error;
1564         }
1565
1566 out:
1567         ip6_cork_release(inet, np);
1568         return err;
1569 error:
1570         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1571         goto out;
1572 }
1573 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1574
1575 void ip6_flush_pending_frames(struct sock *sk)
1576 {
1577         struct sk_buff *skb;
1578
1579         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1580                 if (skb_dst(skb))
1581                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1582                                       IPSTATS_MIB_OUTDISCARDS);
1583                 kfree_skb(skb);
1584         }
1585
1586         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1587 }
1588 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);