net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 int __ip6_local_out(struct sk_buff *skb)
  62 {
  63         int len;
  64
  65         len = skb->len - sizeof(struct ipv6hdr);
  66         if (len > IPV6_MAXPLEN)
  67                 len = 0;
  68         ipv6_hdr(skb)->payload_len = htons(len);
  69
  70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  71                        skb_dst(skb)->dev, dst_output);
  72 }
  73
  74 int ip6_local_out(struct sk_buff *skb)
  75 {
  76         int err;
  77
  78         err = __ip6_local_out(skb);
  79         if (likely(err == 1))
  80                 err = dst_output(skb);
  81
  82         return err;
  83 }
  84 EXPORT_SYMBOL_GPL(ip6_local_out);
  85
  86 /* dev_loopback_xmit for use with netfilter. */
  87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  88 {
  89         skb_reset_mac_header(newskb);
  90         __skb_pull(newskb, skb_network_offset(newskb));
  91         newskb->pkt_type = PACKET_LOOPBACK;
  92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
  93         WARN_ON(!skb_dst(newskb));
  94
  95         netif_rx_ni(newskb);
  96         return 0;
  97 }
  98
  99 static int ip6_finish_output2(struct sk_buff *skb)
 100 {
 101         struct dst_entry *dst = skb_dst(skb);
 102         struct net_device *dev = dst->dev;
 103         struct neighbour *neigh;
 104
 105         skb->protocol = htons(ETH_P_IPV6);
 106         skb->dev = dev;
 107
 108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 110
 111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
 112                     ((mroute6_socket(dev_net(dev), skb) &&
 113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 115                                          &ipv6_hdr(skb)->saddr))) {
 116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 117
 118                         /* Do not check for IFF_ALLMULTI; multicast routing
 119                            is not supported in any case.
 120                          */
 121                         if (newskb)
 122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 123                                         newskb, NULL, newskb->dev,
 124                                         ip6_dev_loopback_xmit);
 125
 126                         if (ipv6_hdr(skb)->hop_limit == 0) {
 127                                 IP6_INC_STATS(dev_net(dev), idev,
 128                                               IPSTATS_MIB_OUTDISCARDS);
 129                                 kfree_skb(skb);
 130                                 return 0;
 131                         }
 132                 }
 133
 134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 135                                 skb->len);
 136         }
 137
 138         rcu_read_lock();
 139         neigh = dst_get_neighbour(dst);
 140         if (neigh) {
 141                 int res = neigh_output(neigh, skb);
 142
 143                 rcu_read_unlock();
 144                 return res;
 145         }
 146         rcu_read_unlock();
 147         IP6_INC_STATS_BH(dev_net(dst->dev),
 148                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 149         kfree_skb(skb);
 150         return -EINVAL;
 151 }
 152
 153 static int ip6_finish_output(struct sk_buff *skb)
 154 {
 155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 156             dst_allfrag(skb_dst(skb)))
 157                 return ip6_fragment(skb, ip6_finish_output2);
 158         else
 159                 return ip6_finish_output2(skb);
 160 }
 161
 162 int ip6_output(struct sk_buff *skb)
 163 {
 164         struct net_device *dev = skb_dst(skb)->dev;
 165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 166         if (unlikely(idev->cnf.disable_ipv6)) {
 167                 IP6_INC_STATS(dev_net(dev), idev,
 168                               IPSTATS_MIB_OUTDISCARDS);
 169                 kfree_skb(skb);
 170                 return 0;
 171         }
 172
 173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 174                             ip6_finish_output,
 175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 176 }
 177
 178 /*
 179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 180  */
 181
 182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 183              struct ipv6_txoptions *opt)
 184 {
 185         struct net *net = sock_net(sk);
 186         struct ipv6_pinfo *np = inet6_sk(sk);
 187         struct in6_addr *first_hop = &fl6->daddr;
 188         struct dst_entry *dst = skb_dst(skb);
 189         struct ipv6hdr *hdr;
 190         u8  proto = fl6->flowi6_proto;
 191         int seg_len = skb->len;
 192         int hlimit = -1;
 193         int tclass = 0;
 194         u32 mtu;
 195
 196         if (opt) {
 197                 unsigned int head_room;
 198
 199                 /* First: exthdrs may take lots of space (~8K for now)
 200                    MAX_HEADER is not enough.
 201                  */
 202                 head_room = opt->opt_nflen + opt->opt_flen;
 203                 seg_len += head_room;
 204                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 205
 206                 if (skb_headroom(skb) < head_room) {
 207                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 208                         if (skb2 == NULL) {
 209                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 210                                               IPSTATS_MIB_OUTDISCARDS);
 211                                 kfree_skb(skb);
 212                                 return -ENOBUFS;
 213                         }
 214                         kfree_skb(skb);
 215                         skb = skb2;
 216                         skb_set_owner_w(skb, sk);
 217                 }
 218                 if (opt->opt_flen)
 219                         ipv6_push_frag_opts(skb, opt, &proto);
 220                 if (opt->opt_nflen)
 221                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 222         }
 223
 224         skb_push(skb, sizeof(struct ipv6hdr));
 225         skb_reset_network_header(skb);
 226         hdr = ipv6_hdr(skb);
 227
 228         /*
 229          *      Fill in the IPv6 header
 230          */
 231         if (np) {
 232                 tclass = np->tclass;
 233                 hlimit = np->hop_limit;
 234         }
 235         if (hlimit < 0)
 236                 hlimit = ip6_dst_hoplimit(dst);
 237
 238         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
 239
 240         hdr->payload_len = htons(seg_len);
 241         hdr->nexthdr = proto;
 242         hdr->hop_limit = hlimit;
 243
 244         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
 245         ipv6_addr_copy(&hdr->daddr, first_hop);
 246
 247         skb->priority = sk->sk_priority;
 248         skb->mark = sk->sk_mark;
 249
 250         mtu = dst_mtu(dst);
 251         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 252                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 253                               IPSTATS_MIB_OUT, skb->len);
 254                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 255                                dst->dev, dst_output);
 256         }
 257
 258         if (net_ratelimit())
 259                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 260         skb->dev = dst->dev;
 261         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 262         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 263         kfree_skb(skb);
 264         return -EMSGSIZE;
 265 }
 266
 267 EXPORT_SYMBOL(ip6_xmit);
 268
 269 /*
 270  *      To avoid extra problems ND packets are send through this
 271  *      routine. It's code duplication but I really want to avoid
 272  *      extra checks since ipv6_build_header is used by TCP (which
 273  *      is for us performance critical)
 274  */
 275
 276 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 277                const struct in6_addr *saddr, const struct in6_addr *daddr,
 278                int proto, int len)
 279 {
 280         struct ipv6_pinfo *np = inet6_sk(sk);
 281         struct ipv6hdr *hdr;
 282
 283         skb->protocol = htons(ETH_P_IPV6);
 284         skb->dev = dev;
 285
 286         skb_reset_network_header(skb);
 287         skb_put(skb, sizeof(struct ipv6hdr));
 288         hdr = ipv6_hdr(skb);
 289
 290         *(__be32*)hdr = htonl(0x60000000);
 291
 292         hdr->payload_len = htons(len);
 293         hdr->nexthdr = proto;
 294         hdr->hop_limit = np->hop_limit;
 295
 296         ipv6_addr_copy(&hdr->saddr, saddr);
 297         ipv6_addr_copy(&hdr->daddr, daddr);
 298
 299         return 0;
 300 }
 301
 302 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 303 {
 304         struct ip6_ra_chain *ra;
 305         struct sock *last = NULL;
 306
 307         read_lock(&ip6_ra_lock);
 308         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 309                 struct sock *sk = ra->sk;
 310                 if (sk && ra->sel == sel &&
 311                     (!sk->sk_bound_dev_if ||
 312                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 313                         if (last) {
 314                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 315                                 if (skb2)
 316                                         rawv6_rcv(last, skb2);
 317                         }
 318                         last = sk;
 319                 }
 320         }
 321
 322         if (last) {
 323                 rawv6_rcv(last, skb);
 324                 read_unlock(&ip6_ra_lock);
 325                 return 1;
 326         }
 327         read_unlock(&ip6_ra_lock);
 328         return 0;
 329 }
 330
 331 static int ip6_forward_proxy_check(struct sk_buff *skb)
 332 {
 333         struct ipv6hdr *hdr = ipv6_hdr(skb);
 334         u8 nexthdr = hdr->nexthdr;
 335         int offset;
 336
 337         if (ipv6_ext_hdr(nexthdr)) {
 338                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 339                 if (offset < 0)
 340                         return 0;
 341         } else
 342                 offset = sizeof(struct ipv6hdr);
 343
 344         if (nexthdr == IPPROTO_ICMPV6) {
 345                 struct icmp6hdr *icmp6;
 346
 347                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 348                                          offset + 1 - skb->data)))
 349                         return 0;
 350
 351                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 352
 353                 switch (icmp6->icmp6_type) {
 354                 case NDISC_ROUTER_SOLICITATION:
 355                 case NDISC_ROUTER_ADVERTISEMENT:
 356                 case NDISC_NEIGHBOUR_SOLICITATION:
 357                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 358                 case NDISC_REDIRECT:
 359                         /* For reaction involving unicast neighbor discovery
 360                          * message destined to the proxied address, pass it to
 361                          * input function.
 362                          */
 363                         return 1;
 364                 default:
 365                         break;
 366                 }
 367         }
 368
 369         /*
 370          * The proxying router can't forward traffic sent to a link-local
 371          * address, so signal the sender and discard the packet. This
 372          * behavior is clarified by the MIPv6 specification.
 373          */
 374         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 375                 dst_link_failure(skb);
 376                 return -1;
 377         }
 378
 379         return 0;
 380 }
 381
 382 static inline int ip6_forward_finish(struct sk_buff *skb)
 383 {
 384         return dst_output(skb);
 385 }
 386
 387 int ip6_forward(struct sk_buff *skb)
 388 {
 389         struct dst_entry *dst = skb_dst(skb);
 390         struct ipv6hdr *hdr = ipv6_hdr(skb);
 391         struct inet6_skb_parm *opt = IP6CB(skb);
 392         struct net *net = dev_net(dst->dev);
 393         struct neighbour *n;
 394         u32 mtu;
 395
 396         if (net->ipv6.devconf_all->forwarding == 0)
 397                 goto error;
 398
 399         if (skb_warn_if_lro(skb))
 400                 goto drop;
 401
 402         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 403                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 404                 goto drop;
 405         }
 406
 407         if (skb->pkt_type != PACKET_HOST)
 408                 goto drop;
 409
 410         skb_forward_csum(skb);
 411
 412         /*
 413          *      We DO NOT make any processing on
 414          *      RA packets, pushing them to user level AS IS
 415          *      without ane WARRANTY that application will be able
 416          *      to interpret them. The reason is that we
 417          *      cannot make anything clever here.
 418          *
 419          *      We are not end-node, so that if packet contains
 420          *      AH/ESP, we cannot make anything.
 421          *      Defragmentation also would be mistake, RA packets
 422          *      cannot be fragmented, because there is no warranty
 423          *      that different fragments will go along one path. --ANK
 424          */
 425         if (opt->ra) {
 426                 u8 *ptr = skb_network_header(skb) + opt->ra;
 427                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 428                         return 0;
 429         }
 430
 431         /*
 432          *      check and decrement ttl
 433          */
 434         if (hdr->hop_limit <= 1) {
 435                 /* Force OUTPUT device used as source address */
 436                 skb->dev = dst->dev;
 437                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 438                 IP6_INC_STATS_BH(net,
 439                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 440
 441                 kfree_skb(skb);
 442                 return -ETIMEDOUT;
 443         }
 444
 445         /* XXX: idev->cnf.proxy_ndp? */
 446         if (net->ipv6.devconf_all->proxy_ndp &&
 447             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 448                 int proxied = ip6_forward_proxy_check(skb);
 449                 if (proxied > 0)
 450                         return ip6_input(skb);
 451                 else if (proxied < 0) {
 452                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 453                                       IPSTATS_MIB_INDISCARDS);
 454                         goto drop;
 455                 }
 456         }
 457
 458         if (!xfrm6_route_forward(skb)) {
 459                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 460                 goto drop;
 461         }
 462         dst = skb_dst(skb);
 463
 464         /* IPv6 specs say nothing about it, but it is clear that we cannot
 465            send redirects to source routed frames.
 466            We don't send redirects to frames decapsulated from IPsec.
 467          */
 468         n = dst_get_neighbour(dst);
 469         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
 470                 struct in6_addr *target = NULL;
 471                 struct rt6_info *rt;
 472
 473                 /*
 474                  *      incoming and outgoing devices are the same
 475                  *      send a redirect.
 476                  */
 477
 478                 rt = (struct rt6_info *) dst;
 479                 if ((rt->rt6i_flags & RTF_GATEWAY))
 480                         target = (struct in6_addr*)&n->primary_key;
 481                 else
 482                         target = &hdr->daddr;
 483
 484                 if (!rt->rt6i_peer)
 485                         rt6_bind_peer(rt, 1);
 486
 487                 /* Limit redirects both by destination (here)
 488                    and by source (inside ndisc_send_redirect)
 489                  */
 490                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
 491                         ndisc_send_redirect(skb, n, target);
 492         } else {
 493                 int addrtype = ipv6_addr_type(&hdr->saddr);
 494
 495                 /* This check is security critical. */
 496                 if (addrtype == IPV6_ADDR_ANY ||
 497                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 498                         goto error;
 499                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 500                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 501                                     ICMPV6_NOT_NEIGHBOUR, 0);
 502                         goto error;
 503                 }
 504         }
 505
 506         mtu = dst_mtu(dst);
 507         if (mtu < IPV6_MIN_MTU)
 508                 mtu = IPV6_MIN_MTU;
 509
 510         if (skb->len > mtu && !skb_is_gso(skb)) {
 511                 /* Again, force OUTPUT device used as source address */
 512                 skb->dev = dst->dev;
 513                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 514                 IP6_INC_STATS_BH(net,
 515                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 516                 IP6_INC_STATS_BH(net,
 517                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 518                 kfree_skb(skb);
 519                 return -EMSGSIZE;
 520         }
 521
 522         if (skb_cow(skb, dst->dev->hard_header_len)) {
 523                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 524                 goto drop;
 525         }
 526
 527         hdr = ipv6_hdr(skb);
 528
 529         /* Mangling hops number delayed to point after skb COW */
 530
 531         hdr->hop_limit--;
 532
 533         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 534         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 535                        ip6_forward_finish);
 536
 537 error:
 538         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 539 drop:
 540         kfree_skb(skb);
 541         return -EINVAL;
 542 }
 543
 544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 545 {
 546         to->pkt_type = from->pkt_type;
 547         to->priority = from->priority;
 548         to->protocol = from->protocol;
 549         skb_dst_drop(to);
 550         skb_dst_set(to, dst_clone(skb_dst(from)));
 551         to->dev = from->dev;
 552         to->mark = from->mark;
 553
 554 #ifdef CONFIG_NET_SCHED
 555         to->tc_index = from->tc_index;
 556 #endif
 557         nf_copy(to, from);
 558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 560         to->nf_trace = from->nf_trace;
 561 #endif
 562         skb_copy_secmark(to, from);
 563 }
 564
 565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 566 {
 567         u16 offset = sizeof(struct ipv6hdr);
 568         struct ipv6_opt_hdr *exthdr =
 569                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 570         unsigned int packet_len = skb->tail - skb->network_header;
 571         int found_rhdr = 0;
 572         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 573
 574         while (offset + 1 <= packet_len) {
 575
 576                 switch (**nexthdr) {
 577
 578                 case NEXTHDR_HOP:
 579                         break;
 580                 case NEXTHDR_ROUTING:
 581                         found_rhdr = 1;
 582                         break;
 583                 case NEXTHDR_DEST:
 584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 585                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 586                                 break;
 587 #endif
 588                         if (found_rhdr)
 589                                 return offset;
 590                         break;
 591                 default :
 592                         return offset;
 593                 }
 594
 595                 offset += ipv6_optlen(exthdr);
 596                 *nexthdr = &exthdr->nexthdr;
 597                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 598                                                  offset);
 599         }
 600
 601         return offset;
 602 }
 603
 604 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 605 {
 606         static atomic_t ipv6_fragmentation_id;
 607         int old, new;
 608
 609         if (rt) {
 610                 struct inet_peer *peer;
 611
 612                 if (!rt->rt6i_peer)
 613                         rt6_bind_peer(rt, 1);
 614                 peer = rt->rt6i_peer;
 615                 if (peer) {
 616                         fhdr->identification = htonl(inet_getid(peer, 0));
 617                         return;
 618                 }
 619         }
 620         do {
 621                 old = atomic_read(&ipv6_fragmentation_id);
 622                 new = old + 1;
 623                 if (!new)
 624                         new = 1;
 625         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
 626         fhdr->identification = htonl(new);
 627 }
 628
 629 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 630 {
 631         struct sk_buff *frag;
 632         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 633         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 634         struct ipv6hdr *tmp_hdr;
 635         struct frag_hdr *fh;
 636         unsigned int mtu, hlen, left, len;
 637         __be32 frag_id = 0;
 638         int ptr, offset = 0, err=0;
 639         u8 *prevhdr, nexthdr = 0;
 640         struct net *net = dev_net(skb_dst(skb)->dev);
 641
 642         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 643         nexthdr = *prevhdr;
 644
 645         mtu = ip6_skb_dst_mtu(skb);
 646
 647         /* We must not fragment if the socket is set to force MTU discovery
 648          * or if the skb it not generated by a local socket.
 649          */
 650         if (!skb->local_df && skb->len > mtu) {
 651                 skb->dev = skb_dst(skb)->dev;
 652                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 653                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 654                               IPSTATS_MIB_FRAGFAILS);
 655                 kfree_skb(skb);
 656                 return -EMSGSIZE;
 657         }
 658
 659         if (np && np->frag_size < mtu) {
 660                 if (np->frag_size)
 661                         mtu = np->frag_size;
 662         }
 663         mtu -= hlen + sizeof(struct frag_hdr);
 664
 665         if (skb_has_frag_list(skb)) {
 666                 int first_len = skb_pagelen(skb);
 667                 struct sk_buff *frag2;
 668
 669                 if (first_len - hlen > mtu ||
 670                     ((first_len - hlen) & 7) ||
 671                     skb_cloned(skb))
 672                         goto slow_path;
 673
 674                 skb_walk_frags(skb, frag) {
 675                         /* Correct geometry. */
 676                         if (frag->len > mtu ||
 677                             ((frag->len & 7) && frag->next) ||
 678                             skb_headroom(frag) < hlen)
 679                                 goto slow_path_clean;
 680
 681                         /* Partially cloned skb? */
 682                         if (skb_shared(frag))
 683                                 goto slow_path_clean;
 684
 685                         BUG_ON(frag->sk);
 686                         if (skb->sk) {
 687                                 frag->sk = skb->sk;
 688                                 frag->destructor = sock_wfree;
 689                         }
 690                         skb->truesize -= frag->truesize;
 691                 }
 692
 693                 err = 0;
 694                 offset = 0;
 695                 frag = skb_shinfo(skb)->frag_list;
 696                 skb_frag_list_init(skb);
 697                 /* BUILD HEADER */
 698
 699                 *prevhdr = NEXTHDR_FRAGMENT;
 700                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 701                 if (!tmp_hdr) {
 702                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 703                                       IPSTATS_MIB_FRAGFAILS);
 704                         return -ENOMEM;
 705                 }
 706
 707                 __skb_pull(skb, hlen);
 708                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 709                 __skb_push(skb, hlen);
 710                 skb_reset_network_header(skb);
 711                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 712
 713                 ipv6_select_ident(fh, rt);
 714                 fh->nexthdr = nexthdr;
 715                 fh->reserved = 0;
 716                 fh->frag_off = htons(IP6_MF);
 717                 frag_id = fh->identification;
 718
 719                 first_len = skb_pagelen(skb);
 720                 skb->data_len = first_len - skb_headlen(skb);
 721                 skb->len = first_len;
 722                 ipv6_hdr(skb)->payload_len = htons(first_len -
 723                                                    sizeof(struct ipv6hdr));
 724
 725                 dst_hold(&rt->dst);
 726
 727                 for (;;) {
 728                         /* Prepare header of the next frame,
 729                          * before previous one went down. */
 730                         if (frag) {
 731                                 frag->ip_summed = CHECKSUM_NONE;
 732                                 skb_reset_transport_header(frag);
 733                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 734                                 __skb_push(frag, hlen);
 735                                 skb_reset_network_header(frag);
 736                                 memcpy(skb_network_header(frag), tmp_hdr,
 737                                        hlen);
 738                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 739                                 fh->nexthdr = nexthdr;
 740                                 fh->reserved = 0;
 741                                 fh->frag_off = htons(offset);
 742                                 if (frag->next != NULL)
 743                                         fh->frag_off |= htons(IP6_MF);
 744                                 fh->identification = frag_id;
 745                                 ipv6_hdr(frag)->payload_len =
 746                                                 htons(frag->len -
 747                                                       sizeof(struct ipv6hdr));
 748                                 ip6_copy_metadata(frag, skb);
 749                         }
 750
 751                         err = output(skb);
 752                         if(!err)
 753                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 754                                               IPSTATS_MIB_FRAGCREATES);
 755
 756                         if (err || !frag)
 757                                 break;
 758
 759                         skb = frag;
 760                         frag = skb->next;
 761                         skb->next = NULL;
 762                 }
 763
 764                 kfree(tmp_hdr);
 765
 766                 if (err == 0) {
 767                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 768                                       IPSTATS_MIB_FRAGOKS);
 769                         dst_release(&rt->dst);
 770                         return 0;
 771                 }
 772
 773                 while (frag) {
 774                         skb = frag->next;
 775                         kfree_skb(frag);
 776                         frag = skb;
 777                 }
 778
 779                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 780                               IPSTATS_MIB_FRAGFAILS);
 781                 dst_release(&rt->dst);
 782                 return err;
 783
 784 slow_path_clean:
 785                 skb_walk_frags(skb, frag2) {
 786                         if (frag2 == frag)
 787                                 break;
 788                         frag2->sk = NULL;
 789                         frag2->destructor = NULL;
 790                         skb->truesize += frag2->truesize;
 791                 }
 792         }
 793
 794 slow_path:
 795         left = skb->len - hlen;         /* Space per frame */
 796         ptr = hlen;                     /* Where to start from */
 797
 798         /*
 799          *      Fragment the datagram.
 800          */
 801
 802         *prevhdr = NEXTHDR_FRAGMENT;
 803
 804         /*
 805          *      Keep copying data until we run out.
 806          */
 807         while(left > 0) {
 808                 len = left;
 809                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 810                 if (len > mtu)
 811                         len = mtu;
 812                 /* IF: we are not sending up to and including the packet end
 813                    then align the next start on an eight byte boundary */
 814                 if (len < left) {
 815                         len &= ~7;
 816                 }
 817                 /*
 818                  *      Allocate buffer.
 819                  */
 820
 821                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
 822                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 823                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 824                                       IPSTATS_MIB_FRAGFAILS);
 825                         err = -ENOMEM;
 826                         goto fail;
 827                 }
 828
 829                 /*
 830                  *      Set up data on packet
 831                  */
 832
 833                 ip6_copy_metadata(frag, skb);
 834                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
 835                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 836                 skb_reset_network_header(frag);
 837                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 838                 frag->transport_header = (frag->network_header + hlen +
 839                                           sizeof(struct frag_hdr));
 840
 841                 /*
 842                  *      Charge the memory for the fragment to any owner
 843                  *      it might possess
 844                  */
 845                 if (skb->sk)
 846                         skb_set_owner_w(frag, skb->sk);
 847
 848                 /*
 849                  *      Copy the packet header into the new buffer.
 850                  */
 851                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 852
 853                 /*
 854                  *      Build fragment header.
 855                  */
 856                 fh->nexthdr = nexthdr;
 857                 fh->reserved = 0;
 858                 if (!frag_id) {
 859                         ipv6_select_ident(fh, rt);
 860                         frag_id = fh->identification;
 861                 } else
 862                         fh->identification = frag_id;
 863
 864                 /*
 865                  *      Copy a block of the IP datagram.
 866                  */
 867                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 868                         BUG();
 869                 left -= len;
 870
 871                 fh->frag_off = htons(offset);
 872                 if (left > 0)
 873                         fh->frag_off |= htons(IP6_MF);
 874                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 875                                                     sizeof(struct ipv6hdr));
 876
 877                 ptr += len;
 878                 offset += len;
 879
 880                 /*
 881                  *      Put this fragment into the sending queue.
 882                  */
 883                 err = output(frag);
 884                 if (err)
 885                         goto fail;
 886
 887                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 888                               IPSTATS_MIB_FRAGCREATES);
 889         }
 890         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 891                       IPSTATS_MIB_FRAGOKS);
 892         kfree_skb(skb);
 893         return err;
 894
 895 fail:
 896         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 897                       IPSTATS_MIB_FRAGFAILS);
 898         kfree_skb(skb);
 899         return err;
 900 }
 901
 902 static inline int ip6_rt_check(const struct rt6key *rt_key,
 903                                const struct in6_addr *fl_addr,
 904                                const struct in6_addr *addr_cache)
 905 {
 906         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 907                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 908 }
 909
 910 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 911                                           struct dst_entry *dst,
 912                                           const struct flowi6 *fl6)
 913 {
 914         struct ipv6_pinfo *np = inet6_sk(sk);
 915         struct rt6_info *rt = (struct rt6_info *)dst;
 916
 917         if (!dst)
 918                 goto out;
 919
 920         /* Yes, checking route validity in not connected
 921          * case is not very simple. Take into account,
 922          * that we do not support routing by source, TOS,
 923          * and MSG_DONTROUTE            --ANK (980726)
 924          *
 925          * 1. ip6_rt_check(): If route was host route,
 926          *    check that cached destination is current.
 927          *    If it is network route, we still may
 928          *    check its validity using saved pointer
 929          *    to the last used address: daddr_cache.
 930          *    We do not want to save whole address now,
 931          *    (because main consumer of this service
 932          *    is tcp, which has not this problem),
 933          *    so that the last trick works only on connected
 934          *    sockets.
 935          * 2. oif also should be the same.
 936          */
 937         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 938 #ifdef CONFIG_IPV6_SUBTREES
 939             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 940 #endif
 941             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 942                 dst_release(dst);
 943                 dst = NULL;
 944         }
 945
 946 out:
 947         return dst;
 948 }
 949
 950 static int ip6_dst_lookup_tail(struct sock *sk,
 951                                struct dst_entry **dst, struct flowi6 *fl6)
 952 {
 953         struct net *net = sock_net(sk);
 954 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 955         struct neighbour *n;
 956 #endif
 957         int err;
 958
 959         if (*dst == NULL)
 960                 *dst = ip6_route_output(net, sk, fl6);
 961
 962         if ((err = (*dst)->error))
 963                 goto out_err_release;
 964
 965         if (ipv6_addr_any(&fl6->saddr)) {
 966                 struct rt6_info *rt = (struct rt6_info *) *dst;
 967                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 968                                           sk ? inet6_sk(sk)->srcprefs : 0,
 969                                           &fl6->saddr);
 970                 if (err)
 971                         goto out_err_release;
 972         }
 973
 974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 975         /*
 976          * Here if the dst entry we've looked up
 977          * has a neighbour entry that is in the INCOMPLETE
 978          * state and the src address from the flow is
 979          * marked as OPTIMISTIC, we release the found
 980          * dst entry and replace it instead with the
 981          * dst entry of the nexthop router
 982          */
 983         rcu_read_lock();
 984         n = dst_get_neighbour(*dst);
 985         if (n && !(n->nud_state & NUD_VALID)) {
 986                 struct inet6_ifaddr *ifp;
 987                 struct flowi6 fl_gw6;
 988                 int redirect;
 989
 990                 rcu_read_unlock();
 991                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 992                                       (*dst)->dev, 1);
 993
 994                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 995                 if (ifp)
 996                         in6_ifa_put(ifp);
 997
 998                 if (redirect) {
 999                         /*
1000                          * We need to get the dst entry for the
1001                          * default router instead
1002                          */
1003                         dst_release(*dst);
1004                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1005                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1006                         *dst = ip6_route_output(net, sk, &fl_gw6);
1007                         if ((err = (*dst)->error))
1008                                 goto out_err_release;
1009                 }
1010         } else {
1011                 rcu_read_unlock();
1012         }
1013 #endif
1014
1015         return 0;
1016
1017 out_err_release:
1018         if (err == -ENETUNREACH)
1019                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1020         dst_release(*dst);
1021         *dst = NULL;
1022         return err;
1023 }
1024
1025 /**
1026  *      ip6_dst_lookup - perform route lookup on flow
1027  *      @sk: socket which provides route info
1028  *      @dst: pointer to dst_entry * for result
1029  *      @fl6: flow to lookup
1030  *
1031  *      This function performs a route lookup on the given flow.
1032  *
1033  *      It returns zero on success, or a standard errno code on error.
1034  */
1035 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1036 {
1037         *dst = NULL;
1038         return ip6_dst_lookup_tail(sk, dst, fl6);
1039 }
1040 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1041
1042 /**
1043  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1044  *      @sk: socket which provides route info
1045  *      @fl6: flow to lookup
1046  *      @final_dst: final destination address for ipsec lookup
1047  *      @can_sleep: we are in a sleepable context
1048  *
1049  *      This function performs a route lookup on the given flow.
1050  *
1051  *      It returns a valid dst pointer on success, or a pointer encoded
1052  *      error code.
1053  */
1054 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1055                                       const struct in6_addr *final_dst,
1056                                       bool can_sleep)
1057 {
1058         struct dst_entry *dst = NULL;
1059         int err;
1060
1061         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1062         if (err)
1063                 return ERR_PTR(err);
1064         if (final_dst)
1065                 ipv6_addr_copy(&fl6->daddr, final_dst);
1066         if (can_sleep)
1067                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1068
1069         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1072
1073 /**
1074  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1075  *      @sk: socket which provides the dst cache and route info
1076  *      @fl6: flow to lookup
1077  *      @final_dst: final destination address for ipsec lookup
1078  *      @can_sleep: we are in a sleepable context
1079  *
1080  *      This function performs a route lookup on the given flow with the
1081  *      possibility of using the cached route in the socket if it is valid.
1082  *      It will take the socket dst lock when operating on the dst cache.
1083  *      As a result, this function can only be used in process context.
1084  *
1085  *      It returns a valid dst pointer on success, or a pointer encoded
1086  *      error code.
1087  */
1088 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1089                                          const struct in6_addr *final_dst,
1090                                          bool can_sleep)
1091 {
1092         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1093         int err;
1094
1095         dst = ip6_sk_dst_check(sk, dst, fl6);
1096
1097         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1098         if (err)
1099                 return ERR_PTR(err);
1100         if (final_dst)
1101                 ipv6_addr_copy(&fl6->daddr, final_dst);
1102         if (can_sleep)
1103                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1104
1105         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1106 }
1107 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1108
1109 static inline int ip6_ufo_append_data(struct sock *sk,
1110                         int getfrag(void *from, char *to, int offset, int len,
1111                         int odd, struct sk_buff *skb),
1112                         void *from, int length, int hh_len, int fragheaderlen,
1113                         int transhdrlen, int mtu,unsigned int flags,
1114                         struct rt6_info *rt)
1115
1116 {
1117         struct sk_buff *skb;
1118         int err;
1119
1120         /* There is support for UDP large send offload by network
1121          * device, so create one single skb packet containing complete
1122          * udp datagram
1123          */
1124         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1125                 skb = sock_alloc_send_skb(sk,
1126                         hh_len + fragheaderlen + transhdrlen + 20,
1127                         (flags & MSG_DONTWAIT), &err);
1128                 if (skb == NULL)
1129                         return -ENOMEM;
1130
1131                 /* reserve space for Hardware header */
1132                 skb_reserve(skb, hh_len);
1133
1134                 /* create space for UDP/IP header */
1135                 skb_put(skb,fragheaderlen + transhdrlen);
1136
1137                 /* initialize network header pointer */
1138                 skb_reset_network_header(skb);
1139
1140                 /* initialize protocol header pointer */
1141                 skb->transport_header = skb->network_header + fragheaderlen;
1142
1143                 skb->ip_summed = CHECKSUM_PARTIAL;
1144                 skb->csum = 0;
1145         }
1146
1147         err = skb_append_datato_frags(sk,skb, getfrag, from,
1148                                       (length - transhdrlen));
1149         if (!err) {
1150                 struct frag_hdr fhdr;
1151
1152                 /* Specify the length of each IPv6 datagram fragment.
1153                  * It has to be a multiple of 8.
1154                  */
1155                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1156                                              sizeof(struct frag_hdr)) & ~7;
1157                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1158                 ipv6_select_ident(&fhdr, rt);
1159                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1160                 __skb_queue_tail(&sk->sk_write_queue, skb);
1161
1162                 return 0;
1163         }
1164         /* There is not enough support do UPD LSO,
1165          * so follow normal path
1166          */
1167         kfree_skb(skb);
1168
1169         return err;
1170 }
1171
1172 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1173                                                gfp_t gfp)
1174 {
1175         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1176 }
1177
1178 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1179                                                 gfp_t gfp)
1180 {
1181         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1182 }
1183
1184 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1185         int offset, int len, int odd, struct sk_buff *skb),
1186         void *from, int length, int transhdrlen,
1187         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1188         struct rt6_info *rt, unsigned int flags, int dontfrag)
1189 {
1190         struct inet_sock *inet = inet_sk(sk);
1191         struct ipv6_pinfo *np = inet6_sk(sk);
1192         struct inet_cork *cork;
1193         struct sk_buff *skb;
1194         unsigned int maxfraglen, fragheaderlen;
1195         int exthdrlen;
1196         int hh_len;
1197         int mtu;
1198         int copy;
1199         int err;
1200         int offset = 0;
1201         int csummode = CHECKSUM_NONE;
1202         __u8 tx_flags = 0;
1203
1204         if (flags&MSG_PROBE)
1205                 return 0;
1206         cork = &inet->cork.base;
1207         if (skb_queue_empty(&sk->sk_write_queue)) {
1208                 /*
1209                  * setup for corking
1210                  */
1211                 if (opt) {
1212                         if (WARN_ON(np->cork.opt))
1213                                 return -EINVAL;
1214
1215                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1216                         if (unlikely(np->cork.opt == NULL))
1217                                 return -ENOBUFS;
1218
1219                         np->cork.opt->tot_len = opt->tot_len;
1220                         np->cork.opt->opt_flen = opt->opt_flen;
1221                         np->cork.opt->opt_nflen = opt->opt_nflen;
1222
1223                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1224                                                             sk->sk_allocation);
1225                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1226                                 return -ENOBUFS;
1227
1228                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1229                                                             sk->sk_allocation);
1230                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1231                                 return -ENOBUFS;
1232
1233                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1234                                                            sk->sk_allocation);
1235                         if (opt->hopopt && !np->cork.opt->hopopt)
1236                                 return -ENOBUFS;
1237
1238                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1239                                                             sk->sk_allocation);
1240                         if (opt->srcrt && !np->cork.opt->srcrt)
1241                                 return -ENOBUFS;
1242
1243                         /* need source address above miyazawa*/
1244                 }
1245                 dst_hold(&rt->dst);
1246                 cork->dst = &rt->dst;
1247                 inet->cork.fl.u.ip6 = *fl6;
1248                 np->cork.hop_limit = hlimit;
1249                 np->cork.tclass = tclass;
1250                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1251                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1252                 if (np->frag_size < mtu) {
1253                         if (np->frag_size)
1254                                 mtu = np->frag_size;
1255                 }
1256                 cork->fragsize = mtu;
1257                 if (dst_allfrag(rt->dst.path))
1258                         cork->flags |= IPCORK_ALLFRAG;
1259                 cork->length = 0;
1260                 sk->sk_sndmsg_page = NULL;
1261                 sk->sk_sndmsg_off = 0;
1262                 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1263                             rt->rt6i_nfheader_len;
1264                 length += exthdrlen;
1265                 transhdrlen += exthdrlen;
1266         } else {
1267                 rt = (struct rt6_info *)cork->dst;
1268                 fl6 = &inet->cork.fl.u.ip6;
1269                 opt = np->cork.opt;
1270                 transhdrlen = 0;
1271                 exthdrlen = 0;
1272                 mtu = cork->fragsize;
1273         }
1274
1275         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1276
1277         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1278                         (opt ? opt->opt_nflen : 0);
1279         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1280
1281         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1282                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1283                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1284                         return -EMSGSIZE;
1285                 }
1286         }
1287
1288         /* For UDP, check if TX timestamp is enabled */
1289         if (sk->sk_type == SOCK_DGRAM) {
1290                 err = sock_tx_timestamp(sk, &tx_flags);
1291                 if (err)
1292                         goto error;
1293         }
1294
1295         /*
1296          * Let's try using as much space as possible.
1297          * Use MTU if total length of the message fits into the MTU.
1298          * Otherwise, we need to reserve fragment header and
1299          * fragment alignment (= 8-15 octects, in total).
1300          *
1301          * Note that we may need to "move" the data from the tail of
1302          * of the buffer to the new fragment when we split
1303          * the message.
1304          *
1305          * FIXME: It may be fragmented into multiple chunks
1306          *        at once if non-fragmentable extension headers
1307          *        are too large.
1308          * --yoshfuji
1309          */
1310
1311         cork->length += length;
1312         if (length > mtu) {
1313                 int proto = sk->sk_protocol;
1314                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1315                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1316                         return -EMSGSIZE;
1317                 }
1318
1319                 if (proto == IPPROTO_UDP &&
1320                     (rt->dst.dev->features & NETIF_F_UFO)) {
1321
1322                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1323                                                   hh_len, fragheaderlen,
1324                                                   transhdrlen, mtu, flags, rt);
1325                         if (err)
1326                                 goto error;
1327                         return 0;
1328                 }
1329         }
1330
1331         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1332                 goto alloc_new_skb;
1333
1334         while (length > 0) {
1335                 /* Check if the remaining data fits into current packet. */
1336                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1337                 if (copy < length)
1338                         copy = maxfraglen - skb->len;
1339
1340                 if (copy <= 0) {
1341                         char *data;
1342                         unsigned int datalen;
1343                         unsigned int fraglen;
1344                         unsigned int fraggap;
1345                         unsigned int alloclen;
1346                         struct sk_buff *skb_prev;
1347 alloc_new_skb:
1348                         skb_prev = skb;
1349
1350                         /* There's no room in the current skb */
1351                         if (skb_prev)
1352                                 fraggap = skb_prev->len - maxfraglen;
1353                         else
1354                                 fraggap = 0;
1355
1356                         /*
1357                          * If remaining data exceeds the mtu,
1358                          * we know we need more fragment(s).
1359                          */
1360                         datalen = length + fraggap;
1361                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1362                                 datalen = maxfraglen - fragheaderlen;
1363
1364                         fraglen = datalen + fragheaderlen;
1365                         if ((flags & MSG_MORE) &&
1366                             !(rt->dst.dev->features&NETIF_F_SG))
1367                                 alloclen = mtu;
1368                         else
1369                                 alloclen = datalen + fragheaderlen;
1370
1371                         /*
1372                          * The last fragment gets additional space at tail.
1373                          * Note: we overallocate on fragments with MSG_MODE
1374                          * because we have no idea if we're the last one.
1375                          */
1376                         if (datalen == length + fraggap)
1377                                 alloclen += rt->dst.trailer_len;
1378
1379                         /*
1380                          * We just reserve space for fragment header.
1381                          * Note: this may be overallocation if the message
1382                          * (without MSG_MORE) fits into the MTU.
1383                          */
1384                         alloclen += sizeof(struct frag_hdr);
1385
1386                         if (transhdrlen) {
1387                                 skb = sock_alloc_send_skb(sk,
1388                                                 alloclen + hh_len,
1389                                                 (flags & MSG_DONTWAIT), &err);
1390                         } else {
1391                                 skb = NULL;
1392                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1393                                     2 * sk->sk_sndbuf)
1394                                         skb = sock_wmalloc(sk,
1395                                                            alloclen + hh_len, 1,
1396                                                            sk->sk_allocation);
1397                                 if (unlikely(skb == NULL))
1398                                         err = -ENOBUFS;
1399                                 else {
1400                                         /* Only the initial fragment
1401                                          * is time stamped.
1402                                          */
1403                                         tx_flags = 0;
1404                                 }
1405                         }
1406                         if (skb == NULL)
1407                                 goto error;
1408                         /*
1409                          *      Fill in the control structures
1410                          */
1411                         skb->ip_summed = csummode;
1412                         skb->csum = 0;
1413                         /* reserve for fragmentation */
1414                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1415
1416                         if (sk->sk_type == SOCK_DGRAM)
1417                                 skb_shinfo(skb)->tx_flags = tx_flags;
1418
1419                         /*
1420                          *      Find where to start putting bytes
1421                          */
1422                         data = skb_put(skb, fraglen);
1423                         skb_set_network_header(skb, exthdrlen);
1424                         data += fragheaderlen;
1425                         skb->transport_header = (skb->network_header +
1426                                                  fragheaderlen);
1427                         if (fraggap) {
1428                                 skb->csum = skb_copy_and_csum_bits(
1429                                         skb_prev, maxfraglen,
1430                                         data + transhdrlen, fraggap, 0);
1431                                 skb_prev->csum = csum_sub(skb_prev->csum,
1432                                                           skb->csum);
1433                                 data += fraggap;
1434                                 pskb_trim_unique(skb_prev, maxfraglen);
1435                         }
1436                         copy = datalen - transhdrlen - fraggap;
1437                         if (copy < 0) {
1438                                 err = -EINVAL;
1439                                 kfree_skb(skb);
1440                                 goto error;
1441                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1442                                 err = -EFAULT;
1443                                 kfree_skb(skb);
1444                                 goto error;
1445                         }
1446
1447                         offset += copy;
1448                         length -= datalen - fraggap;
1449                         transhdrlen = 0;
1450                         exthdrlen = 0;
1451                         csummode = CHECKSUM_NONE;
1452
1453                         /*
1454                          * Put the packet on the pending queue
1455                          */
1456                         __skb_queue_tail(&sk->sk_write_queue, skb);
1457                         continue;
1458                 }
1459
1460                 if (copy > length)
1461                         copy = length;
1462
1463                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1464                         unsigned int off;
1465
1466                         off = skb->len;
1467                         if (getfrag(from, skb_put(skb, copy),
1468                                                 offset, copy, off, skb) < 0) {
1469                                 __skb_trim(skb, off);
1470                                 err = -EFAULT;
1471                                 goto error;
1472                         }
1473                 } else {
1474                         int i = skb_shinfo(skb)->nr_frags;
1475                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1476                         struct page *page = sk->sk_sndmsg_page;
1477                         int off = sk->sk_sndmsg_off;
1478                         unsigned int left;
1479
1480                         if (page && (left = PAGE_SIZE - off) > 0) {
1481                                 if (copy >= left)
1482                                         copy = left;
1483                                 if (page != skb_frag_page(frag)) {
1484                                         if (i == MAX_SKB_FRAGS) {
1485                                                 err = -EMSGSIZE;
1486                                                 goto error;
1487                                         }
1488                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1489                                         skb_frag_ref(skb, i);
1490                                         frag = &skb_shinfo(skb)->frags[i];
1491                                 }
1492                         } else if(i < MAX_SKB_FRAGS) {
1493                                 if (copy > PAGE_SIZE)
1494                                         copy = PAGE_SIZE;
1495                                 page = alloc_pages(sk->sk_allocation, 0);
1496                                 if (page == NULL) {
1497                                         err = -ENOMEM;
1498                                         goto error;
1499                                 }
1500                                 sk->sk_sndmsg_page = page;
1501                                 sk->sk_sndmsg_off = 0;
1502
1503                                 skb_fill_page_desc(skb, i, page, 0, 0);
1504                                 frag = &skb_shinfo(skb)->frags[i];
1505                         } else {
1506                                 err = -EMSGSIZE;
1507                                 goto error;
1508                         }
1509                         if (getfrag(from, skb_frag_address(frag)+frag->size,
1510                                     offset, copy, skb->len, skb) < 0) {
1511                                 err = -EFAULT;
1512                                 goto error;
1513                         }
1514                         sk->sk_sndmsg_off += copy;
1515                         frag->size += copy;
1516                         skb->len += copy;
1517                         skb->data_len += copy;
1518                         skb->truesize += copy;
1519                         atomic_add(copy, &sk->sk_wmem_alloc);
1520                 }
1521                 offset += copy;
1522                 length -= copy;
1523         }
1524         return 0;
1525 error:
1526         cork->length -= length;
1527         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1528         return err;
1529 }
1530
1531 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1532 {
1533         if (np->cork.opt) {
1534                 kfree(np->cork.opt->dst0opt);
1535                 kfree(np->cork.opt->dst1opt);
1536                 kfree(np->cork.opt->hopopt);
1537                 kfree(np->cork.opt->srcrt);
1538                 kfree(np->cork.opt);
1539                 np->cork.opt = NULL;
1540         }
1541
1542         if (inet->cork.base.dst) {
1543                 dst_release(inet->cork.base.dst);
1544                 inet->cork.base.dst = NULL;
1545                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1546         }
1547         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1548 }
1549
1550 int ip6_push_pending_frames(struct sock *sk)
1551 {
1552         struct sk_buff *skb, *tmp_skb;
1553         struct sk_buff **tail_skb;
1554         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1555         struct inet_sock *inet = inet_sk(sk);
1556         struct ipv6_pinfo *np = inet6_sk(sk);
1557         struct net *net = sock_net(sk);
1558         struct ipv6hdr *hdr;
1559         struct ipv6_txoptions *opt = np->cork.opt;
1560         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1561         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1562         unsigned char proto = fl6->flowi6_proto;
1563         int err = 0;
1564
1565         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1566                 goto out;
1567         tail_skb = &(skb_shinfo(skb)->frag_list);
1568
1569         /* move skb->data to ip header from ext header */
1570         if (skb->data < skb_network_header(skb))
1571                 __skb_pull(skb, skb_network_offset(skb));
1572         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1573                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1574                 *tail_skb = tmp_skb;
1575                 tail_skb = &(tmp_skb->next);
1576                 skb->len += tmp_skb->len;
1577                 skb->data_len += tmp_skb->len;
1578                 skb->truesize += tmp_skb->truesize;
1579                 tmp_skb->destructor = NULL;
1580                 tmp_skb->sk = NULL;
1581         }
1582
1583         /* Allow local fragmentation. */
1584         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1585                 skb->local_df = 1;
1586
1587         ipv6_addr_copy(final_dst, &fl6->daddr);
1588         __skb_pull(skb, skb_network_header_len(skb));
1589         if (opt && opt->opt_flen)
1590                 ipv6_push_frag_opts(skb, opt, &proto);
1591         if (opt && opt->opt_nflen)
1592                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1593
1594         skb_push(skb, sizeof(struct ipv6hdr));
1595         skb_reset_network_header(skb);
1596         hdr = ipv6_hdr(skb);
1597
1598         *(__be32*)hdr = fl6->flowlabel |
1599                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1600
1601         hdr->hop_limit = np->cork.hop_limit;
1602         hdr->nexthdr = proto;
1603         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1604         ipv6_addr_copy(&hdr->daddr, final_dst);
1605
1606         skb->priority = sk->sk_priority;
1607         skb->mark = sk->sk_mark;
1608
1609         skb_dst_set(skb, dst_clone(&rt->dst));
1610         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1611         if (proto == IPPROTO_ICMPV6) {
1612                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1613
1614                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1615                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1616         }
1617
1618         err = ip6_local_out(skb);
1619         if (err) {
1620                 if (err > 0)
1621                         err = net_xmit_errno(err);
1622                 if (err)
1623                         goto error;
1624         }
1625
1626 out:
1627         ip6_cork_release(inet, np);
1628         return err;
1629 error:
1630         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1631         goto out;
1632 }
1633
1634 void ip6_flush_pending_frames(struct sock *sk)
1635 {
1636         struct sk_buff *skb;
1637
1638         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1639                 if (skb_dst(skb))
1640                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1641                                       IPSTATS_MIB_OUTDISCARDS);
1642                 kfree_skb(skb);
1643         }
1644
1645         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1646 }