net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 int __ip6_local_out(struct sk_buff *skb)
  62 {
  63         int len;
  64
  65         len = skb->len - sizeof(struct ipv6hdr);
  66         if (len > IPV6_MAXPLEN)
  67                 len = 0;
  68         ipv6_hdr(skb)->payload_len = htons(len);
  69
  70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  71                        skb_dst(skb)->dev, dst_output);
  72 }
  73
  74 int ip6_local_out(struct sk_buff *skb)
  75 {
  76         int err;
  77
  78         err = __ip6_local_out(skb);
  79         if (likely(err == 1))
  80                 err = dst_output(skb);
  81
  82         return err;
  83 }
  84 EXPORT_SYMBOL_GPL(ip6_local_out);
  85
  86 /* dev_loopback_xmit for use with netfilter. */
  87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  88 {
  89         skb_reset_mac_header(newskb);
  90         __skb_pull(newskb, skb_network_offset(newskb));
  91         newskb->pkt_type = PACKET_LOOPBACK;
  92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
  93         WARN_ON(!skb_dst(newskb));
  94
  95         netif_rx_ni(newskb);
  96         return 0;
  97 }
  98
  99 static int ip6_finish_output2(struct sk_buff *skb)
 100 {
 101         struct dst_entry *dst = skb_dst(skb);
 102         struct net_device *dev = dst->dev;
 103
 104         skb->protocol = htons(ETH_P_IPV6);
 105         skb->dev = dev;
 106
 107         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 108                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 109
 110                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
 111                     ((mroute6_socket(dev_net(dev), skb) &&
 112                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 113                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 114                                          &ipv6_hdr(skb)->saddr))) {
 115                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 116
 117                         /* Do not check for IFF_ALLMULTI; multicast routing
 118                            is not supported in any case.
 119                          */
 120                         if (newskb)
 121                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 122                                         newskb, NULL, newskb->dev,
 123                                         ip6_dev_loopback_xmit);
 124
 125                         if (ipv6_hdr(skb)->hop_limit == 0) {
 126                                 IP6_INC_STATS(dev_net(dev), idev,
 127                                               IPSTATS_MIB_OUTDISCARDS);
 128                                 kfree_skb(skb);
 129                                 return 0;
 130                         }
 131                 }
 132
 133                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 134                                 skb->len);
 135         }
 136
 137         if (dst->hh)
 138                 return neigh_hh_output(dst->hh, skb);
 139         else if (dst->neighbour)
 140                 return dst->neighbour->output(skb);
 141
 142         IP6_INC_STATS_BH(dev_net(dst->dev),
 143                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 144         kfree_skb(skb);
 145         return -EINVAL;
 146 }
 147
 148 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 149 {
 150         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 151
 152         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 153                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 154 }
 155
 156 static int ip6_finish_output(struct sk_buff *skb)
 157 {
 158         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 159             dst_allfrag(skb_dst(skb)))
 160                 return ip6_fragment(skb, ip6_finish_output2);
 161         else
 162                 return ip6_finish_output2(skb);
 163 }
 164
 165 int ip6_output(struct sk_buff *skb)
 166 {
 167         struct net_device *dev = skb_dst(skb)->dev;
 168         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 169         if (unlikely(idev->cnf.disable_ipv6)) {
 170                 IP6_INC_STATS(dev_net(dev), idev,
 171                               IPSTATS_MIB_OUTDISCARDS);
 172                 kfree_skb(skb);
 173                 return 0;
 174         }
 175
 176         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 177                             ip6_finish_output,
 178                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 179 }
 180
 181 /*
 182  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 183  */
 184
 185 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 186              struct ipv6_txoptions *opt)
 187 {
 188         struct net *net = sock_net(sk);
 189         struct ipv6_pinfo *np = inet6_sk(sk);
 190         struct in6_addr *first_hop = &fl->fl6_dst;
 191         struct dst_entry *dst = skb_dst(skb);
 192         struct ipv6hdr *hdr;
 193         u8  proto = fl->proto;
 194         int seg_len = skb->len;
 195         int hlimit = -1;
 196         int tclass = 0;
 197         u32 mtu;
 198
 199         if (opt) {
 200                 unsigned int head_room;
 201
 202                 /* First: exthdrs may take lots of space (~8K for now)
 203                    MAX_HEADER is not enough.
 204                  */
 205                 head_room = opt->opt_nflen + opt->opt_flen;
 206                 seg_len += head_room;
 207                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 208
 209                 if (skb_headroom(skb) < head_room) {
 210                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 211                         if (skb2 == NULL) {
 212                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 213                                               IPSTATS_MIB_OUTDISCARDS);
 214                                 kfree_skb(skb);
 215                                 return -ENOBUFS;
 216                         }
 217                         kfree_skb(skb);
 218                         skb = skb2;
 219                         skb_set_owner_w(skb, sk);
 220                 }
 221                 if (opt->opt_flen)
 222                         ipv6_push_frag_opts(skb, opt, &proto);
 223                 if (opt->opt_nflen)
 224                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 225         }
 226
 227         skb_push(skb, sizeof(struct ipv6hdr));
 228         skb_reset_network_header(skb);
 229         hdr = ipv6_hdr(skb);
 230
 231         /*
 232          *      Fill in the IPv6 header
 233          */
 234         if (np) {
 235                 tclass = np->tclass;
 236                 hlimit = np->hop_limit;
 237         }
 238         if (hlimit < 0)
 239                 hlimit = ip6_dst_hoplimit(dst);
 240
 241         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 242
 243         hdr->payload_len = htons(seg_len);
 244         hdr->nexthdr = proto;
 245         hdr->hop_limit = hlimit;
 246
 247         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 248         ipv6_addr_copy(&hdr->daddr, first_hop);
 249
 250         skb->priority = sk->sk_priority;
 251         skb->mark = sk->sk_mark;
 252
 253         mtu = dst_mtu(dst);
 254         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 255                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 256                               IPSTATS_MIB_OUT, skb->len);
 257                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 258                                dst->dev, dst_output);
 259         }
 260
 261         if (net_ratelimit())
 262                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 263         skb->dev = dst->dev;
 264         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 265         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 266         kfree_skb(skb);
 267         return -EMSGSIZE;
 268 }
 269
 270 EXPORT_SYMBOL(ip6_xmit);
 271
 272 /*
 273  *      To avoid extra problems ND packets are send through this
 274  *      routine. It's code duplication but I really want to avoid
 275  *      extra checks since ipv6_build_header is used by TCP (which
 276  *      is for us performance critical)
 277  */
 278
 279 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 280                const struct in6_addr *saddr, const struct in6_addr *daddr,
 281                int proto, int len)
 282 {
 283         struct ipv6_pinfo *np = inet6_sk(sk);
 284         struct ipv6hdr *hdr;
 285         int totlen;
 286
 287         skb->protocol = htons(ETH_P_IPV6);
 288         skb->dev = dev;
 289
 290         totlen = len + sizeof(struct ipv6hdr);
 291
 292         skb_reset_network_header(skb);
 293         skb_put(skb, sizeof(struct ipv6hdr));
 294         hdr = ipv6_hdr(skb);
 295
 296         *(__be32*)hdr = htonl(0x60000000);
 297
 298         hdr->payload_len = htons(len);
 299         hdr->nexthdr = proto;
 300         hdr->hop_limit = np->hop_limit;
 301
 302         ipv6_addr_copy(&hdr->saddr, saddr);
 303         ipv6_addr_copy(&hdr->daddr, daddr);
 304
 305         return 0;
 306 }
 307
 308 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 309 {
 310         struct ip6_ra_chain *ra;
 311         struct sock *last = NULL;
 312
 313         read_lock(&ip6_ra_lock);
 314         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 315                 struct sock *sk = ra->sk;
 316                 if (sk && ra->sel == sel &&
 317                     (!sk->sk_bound_dev_if ||
 318                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 319                         if (last) {
 320                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 321                                 if (skb2)
 322                                         rawv6_rcv(last, skb2);
 323                         }
 324                         last = sk;
 325                 }
 326         }
 327
 328         if (last) {
 329                 rawv6_rcv(last, skb);
 330                 read_unlock(&ip6_ra_lock);
 331                 return 1;
 332         }
 333         read_unlock(&ip6_ra_lock);
 334         return 0;
 335 }
 336
 337 static int ip6_forward_proxy_check(struct sk_buff *skb)
 338 {
 339         struct ipv6hdr *hdr = ipv6_hdr(skb);
 340         u8 nexthdr = hdr->nexthdr;
 341         int offset;
 342
 343         if (ipv6_ext_hdr(nexthdr)) {
 344                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 345                 if (offset < 0)
 346                         return 0;
 347         } else
 348                 offset = sizeof(struct ipv6hdr);
 349
 350         if (nexthdr == IPPROTO_ICMPV6) {
 351                 struct icmp6hdr *icmp6;
 352
 353                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 354                                          offset + 1 - skb->data)))
 355                         return 0;
 356
 357                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 358
 359                 switch (icmp6->icmp6_type) {
 360                 case NDISC_ROUTER_SOLICITATION:
 361                 case NDISC_ROUTER_ADVERTISEMENT:
 362                 case NDISC_NEIGHBOUR_SOLICITATION:
 363                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 364                 case NDISC_REDIRECT:
 365                         /* For reaction involving unicast neighbor discovery
 366                          * message destined to the proxied address, pass it to
 367                          * input function.
 368                          */
 369                         return 1;
 370                 default:
 371                         break;
 372                 }
 373         }
 374
 375         /*
 376          * The proxying router can't forward traffic sent to a link-local
 377          * address, so signal the sender and discard the packet. This
 378          * behavior is clarified by the MIPv6 specification.
 379          */
 380         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 381                 dst_link_failure(skb);
 382                 return -1;
 383         }
 384
 385         return 0;
 386 }
 387
 388 static inline int ip6_forward_finish(struct sk_buff *skb)
 389 {
 390         return dst_output(skb);
 391 }
 392
 393 int ip6_forward(struct sk_buff *skb)
 394 {
 395         struct dst_entry *dst = skb_dst(skb);
 396         struct ipv6hdr *hdr = ipv6_hdr(skb);
 397         struct inet6_skb_parm *opt = IP6CB(skb);
 398         struct net *net = dev_net(dst->dev);
 399         u32 mtu;
 400
 401         if (net->ipv6.devconf_all->forwarding == 0)
 402                 goto error;
 403
 404         if (skb_warn_if_lro(skb))
 405                 goto drop;
 406
 407         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 408                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 409                 goto drop;
 410         }
 411
 412         skb_forward_csum(skb);
 413
 414         /*
 415          *      We DO NOT make any processing on
 416          *      RA packets, pushing them to user level AS IS
 417          *      without ane WARRANTY that application will be able
 418          *      to interpret them. The reason is that we
 419          *      cannot make anything clever here.
 420          *
 421          *      We are not end-node, so that if packet contains
 422          *      AH/ESP, we cannot make anything.
 423          *      Defragmentation also would be mistake, RA packets
 424          *      cannot be fragmented, because there is no warranty
 425          *      that different fragments will go along one path. --ANK
 426          */
 427         if (opt->ra) {
 428                 u8 *ptr = skb_network_header(skb) + opt->ra;
 429                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 430                         return 0;
 431         }
 432
 433         /*
 434          *      check and decrement ttl
 435          */
 436         if (hdr->hop_limit <= 1) {
 437                 /* Force OUTPUT device used as source address */
 438                 skb->dev = dst->dev;
 439                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 440                 IP6_INC_STATS_BH(net,
 441                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 442
 443                 kfree_skb(skb);
 444                 return -ETIMEDOUT;
 445         }
 446
 447         /* XXX: idev->cnf.proxy_ndp? */
 448         if (net->ipv6.devconf_all->proxy_ndp &&
 449             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 450                 int proxied = ip6_forward_proxy_check(skb);
 451                 if (proxied > 0)
 452                         return ip6_input(skb);
 453                 else if (proxied < 0) {
 454                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 455                                       IPSTATS_MIB_INDISCARDS);
 456                         goto drop;
 457                 }
 458         }
 459
 460         if (!xfrm6_route_forward(skb)) {
 461                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 462                 goto drop;
 463         }
 464         dst = skb_dst(skb);
 465
 466         /* IPv6 specs say nothing about it, but it is clear that we cannot
 467            send redirects to source routed frames.
 468            We don't send redirects to frames decapsulated from IPsec.
 469          */
 470         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 471             !skb_sec_path(skb)) {
 472                 struct in6_addr *target = NULL;
 473                 struct rt6_info *rt;
 474                 struct neighbour *n = dst->neighbour;
 475
 476                 /*
 477                  *      incoming and outgoing devices are the same
 478                  *      send a redirect.
 479                  */
 480
 481                 rt = (struct rt6_info *) dst;
 482                 if ((rt->rt6i_flags & RTF_GATEWAY))
 483                         target = (struct in6_addr*)&n->primary_key;
 484                 else
 485                         target = &hdr->daddr;
 486
 487                 /* Limit redirects both by destination (here)
 488                    and by source (inside ndisc_send_redirect)
 489                  */
 490                 if (xrlim_allow(dst, 1*HZ))
 491                         ndisc_send_redirect(skb, n, target);
 492         } else {
 493                 int addrtype = ipv6_addr_type(&hdr->saddr);
 494
 495                 /* This check is security critical. */
 496                 if (addrtype == IPV6_ADDR_ANY ||
 497                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 498                         goto error;
 499                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 500                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 501                                     ICMPV6_NOT_NEIGHBOUR, 0);
 502                         goto error;
 503                 }
 504         }
 505
 506         mtu = dst_mtu(dst);
 507         if (mtu < IPV6_MIN_MTU)
 508                 mtu = IPV6_MIN_MTU;
 509
 510         if (skb->len > mtu && !skb_is_gso(skb)) {
 511                 /* Again, force OUTPUT device used as source address */
 512                 skb->dev = dst->dev;
 513                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 514                 IP6_INC_STATS_BH(net,
 515                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 516                 IP6_INC_STATS_BH(net,
 517                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 518                 kfree_skb(skb);
 519                 return -EMSGSIZE;
 520         }
 521
 522         if (skb_cow(skb, dst->dev->hard_header_len)) {
 523                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 524                 goto drop;
 525         }
 526
 527         hdr = ipv6_hdr(skb);
 528
 529         /* Mangling hops number delayed to point after skb COW */
 530
 531         hdr->hop_limit--;
 532
 533         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 534         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 535                        ip6_forward_finish);
 536
 537 error:
 538         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 539 drop:
 540         kfree_skb(skb);
 541         return -EINVAL;
 542 }
 543
 544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 545 {
 546         to->pkt_type = from->pkt_type;
 547         to->priority = from->priority;
 548         to->protocol = from->protocol;
 549         skb_dst_drop(to);
 550         skb_dst_set(to, dst_clone(skb_dst(from)));
 551         to->dev = from->dev;
 552         to->mark = from->mark;
 553
 554 #ifdef CONFIG_NET_SCHED
 555         to->tc_index = from->tc_index;
 556 #endif
 557         nf_copy(to, from);
 558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 560         to->nf_trace = from->nf_trace;
 561 #endif
 562         skb_copy_secmark(to, from);
 563 }
 564
 565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 566 {
 567         u16 offset = sizeof(struct ipv6hdr);
 568         struct ipv6_opt_hdr *exthdr =
 569                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 570         unsigned int packet_len = skb->tail - skb->network_header;
 571         int found_rhdr = 0;
 572         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 573
 574         while (offset + 1 <= packet_len) {
 575
 576                 switch (**nexthdr) {
 577
 578                 case NEXTHDR_HOP:
 579                         break;
 580                 case NEXTHDR_ROUTING:
 581                         found_rhdr = 1;
 582                         break;
 583                 case NEXTHDR_DEST:
 584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 585                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 586                                 break;
 587 #endif
 588                         if (found_rhdr)
 589                                 return offset;
 590                         break;
 591                 default :
 592                         return offset;
 593                 }
 594
 595                 offset += ipv6_optlen(exthdr);
 596                 *nexthdr = &exthdr->nexthdr;
 597                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 598                                                  offset);
 599         }
 600
 601         return offset;
 602 }
 603
 604 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 605 {
 606         struct sk_buff *frag;
 607         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 608         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 609         struct ipv6hdr *tmp_hdr;
 610         struct frag_hdr *fh;
 611         unsigned int mtu, hlen, left, len;
 612         __be32 frag_id = 0;
 613         int ptr, offset = 0, err=0;
 614         u8 *prevhdr, nexthdr = 0;
 615         struct net *net = dev_net(skb_dst(skb)->dev);
 616
 617         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 618         nexthdr = *prevhdr;
 619
 620         mtu = ip6_skb_dst_mtu(skb);
 621
 622         /* We must not fragment if the socket is set to force MTU discovery
 623          * or if the skb it not generated by a local socket.
 624          */
 625         if (!skb->local_df && skb->len > mtu) {
 626                 skb->dev = skb_dst(skb)->dev;
 627                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 628                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 629                               IPSTATS_MIB_FRAGFAILS);
 630                 kfree_skb(skb);
 631                 return -EMSGSIZE;
 632         }
 633
 634         if (np && np->frag_size < mtu) {
 635                 if (np->frag_size)
 636                         mtu = np->frag_size;
 637         }
 638         mtu -= hlen + sizeof(struct frag_hdr);
 639
 640         if (skb_has_frag_list(skb)) {
 641                 int first_len = skb_pagelen(skb);
 642                 struct sk_buff *frag2;
 643
 644                 if (first_len - hlen > mtu ||
 645                     ((first_len - hlen) & 7) ||
 646                     skb_cloned(skb))
 647                         goto slow_path;
 648
 649                 skb_walk_frags(skb, frag) {
 650                         /* Correct geometry. */
 651                         if (frag->len > mtu ||
 652                             ((frag->len & 7) && frag->next) ||
 653                             skb_headroom(frag) < hlen)
 654                                 goto slow_path_clean;
 655
 656                         /* Partially cloned skb? */
 657                         if (skb_shared(frag))
 658                                 goto slow_path_clean;
 659
 660                         BUG_ON(frag->sk);
 661                         if (skb->sk) {
 662                                 frag->sk = skb->sk;
 663                                 frag->destructor = sock_wfree;
 664                         }
 665                         skb->truesize -= frag->truesize;
 666                 }
 667
 668                 err = 0;
 669                 offset = 0;
 670                 frag = skb_shinfo(skb)->frag_list;
 671                 skb_frag_list_init(skb);
 672                 /* BUILD HEADER */
 673
 674                 *prevhdr = NEXTHDR_FRAGMENT;
 675                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 676                 if (!tmp_hdr) {
 677                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 678                                       IPSTATS_MIB_FRAGFAILS);
 679                         return -ENOMEM;
 680                 }
 681
 682                 __skb_pull(skb, hlen);
 683                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 684                 __skb_push(skb, hlen);
 685                 skb_reset_network_header(skb);
 686                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 687
 688                 ipv6_select_ident(fh);
 689                 fh->nexthdr = nexthdr;
 690                 fh->reserved = 0;
 691                 fh->frag_off = htons(IP6_MF);
 692                 frag_id = fh->identification;
 693
 694                 first_len = skb_pagelen(skb);
 695                 skb->data_len = first_len - skb_headlen(skb);
 696                 skb->len = first_len;
 697                 ipv6_hdr(skb)->payload_len = htons(first_len -
 698                                                    sizeof(struct ipv6hdr));
 699
 700                 dst_hold(&rt->dst);
 701
 702                 for (;;) {
 703                         /* Prepare header of the next frame,
 704                          * before previous one went down. */
 705                         if (frag) {
 706                                 frag->ip_summed = CHECKSUM_NONE;
 707                                 skb_reset_transport_header(frag);
 708                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 709                                 __skb_push(frag, hlen);
 710                                 skb_reset_network_header(frag);
 711                                 memcpy(skb_network_header(frag), tmp_hdr,
 712                                        hlen);
 713                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 714                                 fh->nexthdr = nexthdr;
 715                                 fh->reserved = 0;
 716                                 fh->frag_off = htons(offset);
 717                                 if (frag->next != NULL)
 718                                         fh->frag_off |= htons(IP6_MF);
 719                                 fh->identification = frag_id;
 720                                 ipv6_hdr(frag)->payload_len =
 721                                                 htons(frag->len -
 722                                                       sizeof(struct ipv6hdr));
 723                                 ip6_copy_metadata(frag, skb);
 724                         }
 725
 726                         err = output(skb);
 727                         if(!err)
 728                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 729                                               IPSTATS_MIB_FRAGCREATES);
 730
 731                         if (err || !frag)
 732                                 break;
 733
 734                         skb = frag;
 735                         frag = skb->next;
 736                         skb->next = NULL;
 737                 }
 738
 739                 kfree(tmp_hdr);
 740
 741                 if (err == 0) {
 742                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 743                                       IPSTATS_MIB_FRAGOKS);
 744                         dst_release(&rt->dst);
 745                         return 0;
 746                 }
 747
 748                 while (frag) {
 749                         skb = frag->next;
 750                         kfree_skb(frag);
 751                         frag = skb;
 752                 }
 753
 754                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 755                               IPSTATS_MIB_FRAGFAILS);
 756                 dst_release(&rt->dst);
 757                 return err;
 758
 759 slow_path_clean:
 760                 skb_walk_frags(skb, frag2) {
 761                         if (frag2 == frag)
 762                                 break;
 763                         frag2->sk = NULL;
 764                         frag2->destructor = NULL;
 765                         skb->truesize += frag2->truesize;
 766                 }
 767         }
 768
 769 slow_path:
 770         left = skb->len - hlen;         /* Space per frame */
 771         ptr = hlen;                     /* Where to start from */
 772
 773         /*
 774          *      Fragment the datagram.
 775          */
 776
 777         *prevhdr = NEXTHDR_FRAGMENT;
 778
 779         /*
 780          *      Keep copying data until we run out.
 781          */
 782         while(left > 0) {
 783                 len = left;
 784                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 785                 if (len > mtu)
 786                         len = mtu;
 787                 /* IF: we are not sending upto and including the packet end
 788                    then align the next start on an eight byte boundary */
 789                 if (len < left) {
 790                         len &= ~7;
 791                 }
 792                 /*
 793                  *      Allocate buffer.
 794                  */
 795
 796                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
 797                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 798                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 799                                       IPSTATS_MIB_FRAGFAILS);
 800                         err = -ENOMEM;
 801                         goto fail;
 802                 }
 803
 804                 /*
 805                  *      Set up data on packet
 806                  */
 807
 808                 ip6_copy_metadata(frag, skb);
 809                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
 810                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 811                 skb_reset_network_header(frag);
 812                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 813                 frag->transport_header = (frag->network_header + hlen +
 814                                           sizeof(struct frag_hdr));
 815
 816                 /*
 817                  *      Charge the memory for the fragment to any owner
 818                  *      it might possess
 819                  */
 820                 if (skb->sk)
 821                         skb_set_owner_w(frag, skb->sk);
 822
 823                 /*
 824                  *      Copy the packet header into the new buffer.
 825                  */
 826                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 827
 828                 /*
 829                  *      Build fragment header.
 830                  */
 831                 fh->nexthdr = nexthdr;
 832                 fh->reserved = 0;
 833                 if (!frag_id) {
 834                         ipv6_select_ident(fh);
 835                         frag_id = fh->identification;
 836                 } else
 837                         fh->identification = frag_id;
 838
 839                 /*
 840                  *      Copy a block of the IP datagram.
 841                  */
 842                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 843                         BUG();
 844                 left -= len;
 845
 846                 fh->frag_off = htons(offset);
 847                 if (left > 0)
 848                         fh->frag_off |= htons(IP6_MF);
 849                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 850                                                     sizeof(struct ipv6hdr));
 851
 852                 ptr += len;
 853                 offset += len;
 854
 855                 /*
 856                  *      Put this fragment into the sending queue.
 857                  */
 858                 err = output(frag);
 859                 if (err)
 860                         goto fail;
 861
 862                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 863                               IPSTATS_MIB_FRAGCREATES);
 864         }
 865         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 866                       IPSTATS_MIB_FRAGOKS);
 867         kfree_skb(skb);
 868         return err;
 869
 870 fail:
 871         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 872                       IPSTATS_MIB_FRAGFAILS);
 873         kfree_skb(skb);
 874         return err;
 875 }
 876
 877 static inline int ip6_rt_check(struct rt6key *rt_key,
 878                                struct in6_addr *fl_addr,
 879                                struct in6_addr *addr_cache)
 880 {
 881         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 882                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 883 }
 884
 885 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 886                                           struct dst_entry *dst,
 887                                           struct flowi *fl)
 888 {
 889         struct ipv6_pinfo *np = inet6_sk(sk);
 890         struct rt6_info *rt = (struct rt6_info *)dst;
 891
 892         if (!dst)
 893                 goto out;
 894
 895         /* Yes, checking route validity in not connected
 896          * case is not very simple. Take into account,
 897          * that we do not support routing by source, TOS,
 898          * and MSG_DONTROUTE            --ANK (980726)
 899          *
 900          * 1. ip6_rt_check(): If route was host route,
 901          *    check that cached destination is current.
 902          *    If it is network route, we still may
 903          *    check its validity using saved pointer
 904          *    to the last used address: daddr_cache.
 905          *    We do not want to save whole address now,
 906          *    (because main consumer of this service
 907          *    is tcp, which has not this problem),
 908          *    so that the last trick works only on connected
 909          *    sockets.
 910          * 2. oif also should be the same.
 911          */
 912         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 913 #ifdef CONFIG_IPV6_SUBTREES
 914             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 915 #endif
 916             (fl->oif && fl->oif != dst->dev->ifindex)) {
 917                 dst_release(dst);
 918                 dst = NULL;
 919         }
 920
 921 out:
 922         return dst;
 923 }
 924
 925 static int ip6_dst_lookup_tail(struct sock *sk,
 926                                struct dst_entry **dst, struct flowi *fl)
 927 {
 928         int err;
 929         struct net *net = sock_net(sk);
 930
 931         if (*dst == NULL)
 932                 *dst = ip6_route_output(net, sk, fl);
 933
 934         if ((err = (*dst)->error))
 935                 goto out_err_release;
 936
 937         if (ipv6_addr_any(&fl->fl6_src)) {
 938                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
 939                                          &fl->fl6_dst,
 940                                          sk ? inet6_sk(sk)->srcprefs : 0,
 941                                          &fl->fl6_src);
 942                 if (err)
 943                         goto out_err_release;
 944         }
 945
 946 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 947         /*
 948          * Here if the dst entry we've looked up
 949          * has a neighbour entry that is in the INCOMPLETE
 950          * state and the src address from the flow is
 951          * marked as OPTIMISTIC, we release the found
 952          * dst entry and replace it instead with the
 953          * dst entry of the nexthop router
 954          */
 955         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
 956                 struct inet6_ifaddr *ifp;
 957                 struct flowi fl_gw;
 958                 int redirect;
 959
 960                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
 961                                       (*dst)->dev, 1);
 962
 963                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 964                 if (ifp)
 965                         in6_ifa_put(ifp);
 966
 967                 if (redirect) {
 968                         /*
 969                          * We need to get the dst entry for the
 970                          * default router instead
 971                          */
 972                         dst_release(*dst);
 973                         memcpy(&fl_gw, fl, sizeof(struct flowi));
 974                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 975                         *dst = ip6_route_output(net, sk, &fl_gw);
 976                         if ((err = (*dst)->error))
 977                                 goto out_err_release;
 978                 }
 979         }
 980 #endif
 981
 982         return 0;
 983
 984 out_err_release:
 985         if (err == -ENETUNREACH)
 986                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 987         dst_release(*dst);
 988         *dst = NULL;
 989         return err;
 990 }
 991
 992 /**
 993  *      ip6_dst_lookup - perform route lookup on flow
 994  *      @sk: socket which provides route info
 995  *      @dst: pointer to dst_entry * for result
 996  *      @fl: flow to lookup
 997  *
 998  *      This function performs a route lookup on the given flow.
 999  *
1000  *      It returns zero on success, or a standard errno code on error.
1001  */
1002 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1003 {
1004         *dst = NULL;
1005         return ip6_dst_lookup_tail(sk, dst, fl);
1006 }
1007 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1008
1009 /**
1010  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1011  *      @sk: socket which provides the dst cache and route info
1012  *      @dst: pointer to dst_entry * for result
1013  *      @fl: flow to lookup
1014  *
1015  *      This function performs a route lookup on the given flow with the
1016  *      possibility of using the cached route in the socket if it is valid.
1017  *      It will take the socket dst lock when operating on the dst cache.
1018  *      As a result, this function can only be used in process context.
1019  *
1020  *      It returns zero on success, or a standard errno code on error.
1021  */
1022 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1023 {
1024         *dst = NULL;
1025         if (sk) {
1026                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1027                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1028         }
1029
1030         return ip6_dst_lookup_tail(sk, dst, fl);
1031 }
1032 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1033
1034 static inline int ip6_ufo_append_data(struct sock *sk,
1035                         int getfrag(void *from, char *to, int offset, int len,
1036                         int odd, struct sk_buff *skb),
1037                         void *from, int length, int hh_len, int fragheaderlen,
1038                         int transhdrlen, int mtu,unsigned int flags)
1039
1040 {
1041         struct sk_buff *skb;
1042         int err;
1043
1044         /* There is support for UDP large send offload by network
1045          * device, so create one single skb packet containing complete
1046          * udp datagram
1047          */
1048         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1049                 skb = sock_alloc_send_skb(sk,
1050                         hh_len + fragheaderlen + transhdrlen + 20,
1051                         (flags & MSG_DONTWAIT), &err);
1052                 if (skb == NULL)
1053                         return -ENOMEM;
1054
1055                 /* reserve space for Hardware header */
1056                 skb_reserve(skb, hh_len);
1057
1058                 /* create space for UDP/IP header */
1059                 skb_put(skb,fragheaderlen + transhdrlen);
1060
1061                 /* initialize network header pointer */
1062                 skb_reset_network_header(skb);
1063
1064                 /* initialize protocol header pointer */
1065                 skb->transport_header = skb->network_header + fragheaderlen;
1066
1067                 skb->ip_summed = CHECKSUM_PARTIAL;
1068                 skb->csum = 0;
1069                 sk->sk_sndmsg_off = 0;
1070         }
1071
1072         err = skb_append_datato_frags(sk,skb, getfrag, from,
1073                                       (length - transhdrlen));
1074         if (!err) {
1075                 struct frag_hdr fhdr;
1076
1077                 /* Specify the length of each IPv6 datagram fragment.
1078                  * It has to be a multiple of 8.
1079                  */
1080                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1081                                              sizeof(struct frag_hdr)) & ~7;
1082                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1083                 ipv6_select_ident(&fhdr);
1084                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1085                 __skb_queue_tail(&sk->sk_write_queue, skb);
1086
1087                 return 0;
1088         }
1089         /* There is not enough support do UPD LSO,
1090          * so follow normal path
1091          */
1092         kfree_skb(skb);
1093
1094         return err;
1095 }
1096
1097 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1098                                                gfp_t gfp)
1099 {
1100         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1101 }
1102
1103 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1104                                                 gfp_t gfp)
1105 {
1106         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1107 }
1108
1109 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1110         int offset, int len, int odd, struct sk_buff *skb),
1111         void *from, int length, int transhdrlen,
1112         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1113         struct rt6_info *rt, unsigned int flags, int dontfrag)
1114 {
1115         struct inet_sock *inet = inet_sk(sk);
1116         struct ipv6_pinfo *np = inet6_sk(sk);
1117         struct sk_buff *skb;
1118         unsigned int maxfraglen, fragheaderlen;
1119         int exthdrlen;
1120         int hh_len;
1121         int mtu;
1122         int copy;
1123         int err;
1124         int offset = 0;
1125         int csummode = CHECKSUM_NONE;
1126
1127         if (flags&MSG_PROBE)
1128                 return 0;
1129         if (skb_queue_empty(&sk->sk_write_queue)) {
1130                 /*
1131                  * setup for corking
1132                  */
1133                 if (opt) {
1134                         if (WARN_ON(np->cork.opt))
1135                                 return -EINVAL;
1136
1137                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1138                         if (unlikely(np->cork.opt == NULL))
1139                                 return -ENOBUFS;
1140
1141                         np->cork.opt->tot_len = opt->tot_len;
1142                         np->cork.opt->opt_flen = opt->opt_flen;
1143                         np->cork.opt->opt_nflen = opt->opt_nflen;
1144
1145                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1146                                                             sk->sk_allocation);
1147                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1148                                 return -ENOBUFS;
1149
1150                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1151                                                             sk->sk_allocation);
1152                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1153                                 return -ENOBUFS;
1154
1155                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1156                                                            sk->sk_allocation);
1157                         if (opt->hopopt && !np->cork.opt->hopopt)
1158                                 return -ENOBUFS;
1159
1160                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1161                                                             sk->sk_allocation);
1162                         if (opt->srcrt && !np->cork.opt->srcrt)
1163                                 return -ENOBUFS;
1164
1165                         /* need source address above miyazawa*/
1166                 }
1167                 dst_hold(&rt->dst);
1168                 inet->cork.dst = &rt->dst;
1169                 inet->cork.fl = *fl;
1170                 np->cork.hop_limit = hlimit;
1171                 np->cork.tclass = tclass;
1172                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1173                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1174                 if (np->frag_size < mtu) {
1175                         if (np->frag_size)
1176                                 mtu = np->frag_size;
1177                 }
1178                 inet->cork.fragsize = mtu;
1179                 if (dst_allfrag(rt->dst.path))
1180                         inet->cork.flags |= IPCORK_ALLFRAG;
1181                 inet->cork.length = 0;
1182                 sk->sk_sndmsg_page = NULL;
1183                 sk->sk_sndmsg_off = 0;
1184                 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1185                             rt->rt6i_nfheader_len;
1186                 length += exthdrlen;
1187                 transhdrlen += exthdrlen;
1188         } else {
1189                 rt = (struct rt6_info *)inet->cork.dst;
1190                 fl = &inet->cork.fl;
1191                 opt = np->cork.opt;
1192                 transhdrlen = 0;
1193                 exthdrlen = 0;
1194                 mtu = inet->cork.fragsize;
1195         }
1196
1197         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1198
1199         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1200                         (opt ? opt->opt_nflen : 0);
1201         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1202
1203         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1204                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1205                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1206                         return -EMSGSIZE;
1207                 }
1208         }
1209
1210         /*
1211          * Let's try using as much space as possible.
1212          * Use MTU if total length of the message fits into the MTU.
1213          * Otherwise, we need to reserve fragment header and
1214          * fragment alignment (= 8-15 octects, in total).
1215          *
1216          * Note that we may need to "move" the data from the tail of
1217          * of the buffer to the new fragment when we split
1218          * the message.
1219          *
1220          * FIXME: It may be fragmented into multiple chunks
1221          *        at once if non-fragmentable extension headers
1222          *        are too large.
1223          * --yoshfuji
1224          */
1225
1226         inet->cork.length += length;
1227         if (length > mtu) {
1228                 int proto = sk->sk_protocol;
1229                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1230                         ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1231                         return -EMSGSIZE;
1232                 }
1233
1234                 if (proto == IPPROTO_UDP &&
1235                     (rt->dst.dev->features & NETIF_F_UFO)) {
1236
1237                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1238                                                   hh_len, fragheaderlen,
1239                                                   transhdrlen, mtu, flags);
1240                         if (err)
1241                                 goto error;
1242                         return 0;
1243                 }
1244         }
1245
1246         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1247                 goto alloc_new_skb;
1248
1249         while (length > 0) {
1250                 /* Check if the remaining data fits into current packet. */
1251                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1252                 if (copy < length)
1253                         copy = maxfraglen - skb->len;
1254
1255                 if (copy <= 0) {
1256                         char *data;
1257                         unsigned int datalen;
1258                         unsigned int fraglen;
1259                         unsigned int fraggap;
1260                         unsigned int alloclen;
1261                         struct sk_buff *skb_prev;
1262 alloc_new_skb:
1263                         skb_prev = skb;
1264
1265                         /* There's no room in the current skb */
1266                         if (skb_prev)
1267                                 fraggap = skb_prev->len - maxfraglen;
1268                         else
1269                                 fraggap = 0;
1270
1271                         /*
1272                          * If remaining data exceeds the mtu,
1273                          * we know we need more fragment(s).
1274                          */
1275                         datalen = length + fraggap;
1276                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1277                                 datalen = maxfraglen - fragheaderlen;
1278
1279                         fraglen = datalen + fragheaderlen;
1280                         if ((flags & MSG_MORE) &&
1281                             !(rt->dst.dev->features&NETIF_F_SG))
1282                                 alloclen = mtu;
1283                         else
1284                                 alloclen = datalen + fragheaderlen;
1285
1286                         /*
1287                          * The last fragment gets additional space at tail.
1288                          * Note: we overallocate on fragments with MSG_MODE
1289                          * because we have no idea if we're the last one.
1290                          */
1291                         if (datalen == length + fraggap)
1292                                 alloclen += rt->dst.trailer_len;
1293
1294                         /*
1295                          * We just reserve space for fragment header.
1296                          * Note: this may be overallocation if the message
1297                          * (without MSG_MORE) fits into the MTU.
1298                          */
1299                         alloclen += sizeof(struct frag_hdr);
1300
1301                         if (transhdrlen) {
1302                                 skb = sock_alloc_send_skb(sk,
1303                                                 alloclen + hh_len,
1304                                                 (flags & MSG_DONTWAIT), &err);
1305                         } else {
1306                                 skb = NULL;
1307                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1308                                     2 * sk->sk_sndbuf)
1309                                         skb = sock_wmalloc(sk,
1310                                                            alloclen + hh_len, 1,
1311                                                            sk->sk_allocation);
1312                                 if (unlikely(skb == NULL))
1313                                         err = -ENOBUFS;
1314                         }
1315                         if (skb == NULL)
1316                                 goto error;
1317                         /*
1318                          *      Fill in the control structures
1319                          */
1320                         skb->ip_summed = csummode;
1321                         skb->csum = 0;
1322                         /* reserve for fragmentation */
1323                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1324
1325                         /*
1326                          *      Find where to start putting bytes
1327                          */
1328                         data = skb_put(skb, fraglen);
1329                         skb_set_network_header(skb, exthdrlen);
1330                         data += fragheaderlen;
1331                         skb->transport_header = (skb->network_header +
1332                                                  fragheaderlen);
1333                         if (fraggap) {
1334                                 skb->csum = skb_copy_and_csum_bits(
1335                                         skb_prev, maxfraglen,
1336                                         data + transhdrlen, fraggap, 0);
1337                                 skb_prev->csum = csum_sub(skb_prev->csum,
1338                                                           skb->csum);
1339                                 data += fraggap;
1340                                 pskb_trim_unique(skb_prev, maxfraglen);
1341                         }
1342                         copy = datalen - transhdrlen - fraggap;
1343                         if (copy < 0) {
1344                                 err = -EINVAL;
1345                                 kfree_skb(skb);
1346                                 goto error;
1347                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1348                                 err = -EFAULT;
1349                                 kfree_skb(skb);
1350                                 goto error;
1351                         }
1352
1353                         offset += copy;
1354                         length -= datalen - fraggap;
1355                         transhdrlen = 0;
1356                         exthdrlen = 0;
1357                         csummode = CHECKSUM_NONE;
1358
1359                         /*
1360                          * Put the packet on the pending queue
1361                          */
1362                         __skb_queue_tail(&sk->sk_write_queue, skb);
1363                         continue;
1364                 }
1365
1366                 if (copy > length)
1367                         copy = length;
1368
1369                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1370                         unsigned int off;
1371
1372                         off = skb->len;
1373                         if (getfrag(from, skb_put(skb, copy),
1374                                                 offset, copy, off, skb) < 0) {
1375                                 __skb_trim(skb, off);
1376                                 err = -EFAULT;
1377                                 goto error;
1378                         }
1379                 } else {
1380                         int i = skb_shinfo(skb)->nr_frags;
1381                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1382                         struct page *page = sk->sk_sndmsg_page;
1383                         int off = sk->sk_sndmsg_off;
1384                         unsigned int left;
1385
1386                         if (page && (left = PAGE_SIZE - off) > 0) {
1387                                 if (copy >= left)
1388                                         copy = left;
1389                                 if (page != frag->page) {
1390                                         if (i == MAX_SKB_FRAGS) {
1391                                                 err = -EMSGSIZE;
1392                                                 goto error;
1393                                         }
1394                                         get_page(page);
1395                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1396                                         frag = &skb_shinfo(skb)->frags[i];
1397                                 }
1398                         } else if(i < MAX_SKB_FRAGS) {
1399                                 if (copy > PAGE_SIZE)
1400                                         copy = PAGE_SIZE;
1401                                 page = alloc_pages(sk->sk_allocation, 0);
1402                                 if (page == NULL) {
1403                                         err = -ENOMEM;
1404                                         goto error;
1405                                 }
1406                                 sk->sk_sndmsg_page = page;
1407                                 sk->sk_sndmsg_off = 0;
1408
1409                                 skb_fill_page_desc(skb, i, page, 0, 0);
1410                                 frag = &skb_shinfo(skb)->frags[i];
1411                         } else {
1412                                 err = -EMSGSIZE;
1413                                 goto error;
1414                         }
1415                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1416                                 err = -EFAULT;
1417                                 goto error;
1418                         }
1419                         sk->sk_sndmsg_off += copy;
1420                         frag->size += copy;
1421                         skb->len += copy;
1422                         skb->data_len += copy;
1423                         skb->truesize += copy;
1424                         atomic_add(copy, &sk->sk_wmem_alloc);
1425                 }
1426                 offset += copy;
1427                 length -= copy;
1428         }
1429         return 0;
1430 error:
1431         inet->cork.length -= length;
1432         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1433         return err;
1434 }
1435
1436 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1437 {
1438         if (np->cork.opt) {
1439                 kfree(np->cork.opt->dst0opt);
1440                 kfree(np->cork.opt->dst1opt);
1441                 kfree(np->cork.opt->hopopt);
1442                 kfree(np->cork.opt->srcrt);
1443                 kfree(np->cork.opt);
1444                 np->cork.opt = NULL;
1445         }
1446
1447         if (inet->cork.dst) {
1448                 dst_release(inet->cork.dst);
1449                 inet->cork.dst = NULL;
1450                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1451         }
1452         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1453 }
1454
1455 int ip6_push_pending_frames(struct sock *sk)
1456 {
1457         struct sk_buff *skb, *tmp_skb;
1458         struct sk_buff **tail_skb;
1459         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1460         struct inet_sock *inet = inet_sk(sk);
1461         struct ipv6_pinfo *np = inet6_sk(sk);
1462         struct net *net = sock_net(sk);
1463         struct ipv6hdr *hdr;
1464         struct ipv6_txoptions *opt = np->cork.opt;
1465         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1466         struct flowi *fl = &inet->cork.fl;
1467         unsigned char proto = fl->proto;
1468         int err = 0;
1469
1470         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1471                 goto out;
1472         tail_skb = &(skb_shinfo(skb)->frag_list);
1473
1474         /* move skb->data to ip header from ext header */
1475         if (skb->data < skb_network_header(skb))
1476                 __skb_pull(skb, skb_network_offset(skb));
1477         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1478                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1479                 *tail_skb = tmp_skb;
1480                 tail_skb = &(tmp_skb->next);
1481                 skb->len += tmp_skb->len;
1482                 skb->data_len += tmp_skb->len;
1483                 skb->truesize += tmp_skb->truesize;
1484                 tmp_skb->destructor = NULL;
1485                 tmp_skb->sk = NULL;
1486         }
1487
1488         /* Allow local fragmentation. */
1489         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1490                 skb->local_df = 1;
1491
1492         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1493         __skb_pull(skb, skb_network_header_len(skb));
1494         if (opt && opt->opt_flen)
1495                 ipv6_push_frag_opts(skb, opt, &proto);
1496         if (opt && opt->opt_nflen)
1497                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1498
1499         skb_push(skb, sizeof(struct ipv6hdr));
1500         skb_reset_network_header(skb);
1501         hdr = ipv6_hdr(skb);
1502
1503         *(__be32*)hdr = fl->fl6_flowlabel |
1504                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1505
1506         hdr->hop_limit = np->cork.hop_limit;
1507         hdr->nexthdr = proto;
1508         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1509         ipv6_addr_copy(&hdr->daddr, final_dst);
1510
1511         skb->priority = sk->sk_priority;
1512         skb->mark = sk->sk_mark;
1513
1514         skb_dst_set(skb, dst_clone(&rt->dst));
1515         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1516         if (proto == IPPROTO_ICMPV6) {
1517                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1518
1519                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1520                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1521         }
1522
1523         err = ip6_local_out(skb);
1524         if (err) {
1525                 if (err > 0)
1526                         err = net_xmit_errno(err);
1527                 if (err)
1528                         goto error;
1529         }
1530
1531 out:
1532         ip6_cork_release(inet, np);
1533         return err;
1534 error:
1535         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1536         goto out;
1537 }
1538
1539 void ip6_flush_pending_frames(struct sock *sk)
1540 {
1541         struct sk_buff *skb;
1542
1543         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1544                 if (skb_dst(skb))
1545                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1546                                       IPSTATS_MIB_OUTDISCARDS);
1547                 kfree_skb(skb);
1548         }
1549
1550         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1551 }