net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *      Based on linux/net/ipv4/ip_output.c
  11  *
  12  *      This program is free software; you can redistribute it and/or
  13  *      modify it under the terms of the GNU General Public License
  14  *      as published by the Free Software Foundation; either version
  15  *      2 of the License, or (at your option) any later version.
  16  *
  17  *      Changes:
   18  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  19  *                              extension headers are implemented.
  20  *                              route changes now work.
  21  *                              ip6_forward does not confuse sniffers.
  22  *                              etc.
  23  *
  24  *      H. von Brand    :       Added missing #include <linux/string.h>
  25  *      Imran Patel     :       frag id should be in NBO
  26  *      Kazunori MIYAZAWA @USAGI
  27  *                      :       add ip6_append_data and related functions
  28  *                              for datagram xmit
  29  */
  30
  31 #include <linux/errno.h>
  32 #include <linux/kernel.h>
  33 #include <linux/string.h>
  34 #include <linux/socket.h>
  35 #include <linux/net.h>
  36 #include <linux/netdevice.h>
  37 #include <linux/if_arp.h>
  38 #include <linux/in6.h>
  39 #include <linux/tcp.h>
  40 #include <linux/route.h>
  41 #include <linux/module.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  62 {
  63         static u32 ipv6_fragmentation_id = 1;
  64         static DEFINE_SPINLOCK(ip6_id_lock);
  65
  66         spin_lock_bh(&ip6_id_lock);
  67         fhdr->identification = htonl(ipv6_fragmentation_id);
  68         if (++ipv6_fragmentation_id == 0)
  69                 ipv6_fragmentation_id = 1;
  70         spin_unlock_bh(&ip6_id_lock);
  71 }
  72
  73 int __ip6_local_out(struct sk_buff *skb)
  74 {
  75         int len;
  76
  77         len = skb->len - sizeof(struct ipv6hdr);
  78         if (len > IPV6_MAXPLEN)
  79                 len = 0;
  80         ipv6_hdr(skb)->payload_len = htons(len);
  81
  82         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
  83                        dst_output);
  84 }
  85
  86 int ip6_local_out(struct sk_buff *skb)
  87 {
  88         int err;
  89
  90         err = __ip6_local_out(skb);
  91         if (likely(err == 1))
  92                 err = dst_output(skb);
  93
  94         return err;
  95 }
  96 EXPORT_SYMBOL_GPL(ip6_local_out);
  97
  98 static int ip6_output_finish(struct sk_buff *skb)
  99 {
 100         struct dst_entry *dst = skb->dst;
 101
 102         if (dst->hh)
 103                 return neigh_hh_output(dst->hh, skb);
 104         else if (dst->neighbour)
 105                 return dst->neighbour->output(skb);
 106
 107         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 108         kfree_skb(skb);
 109         return -EINVAL;
 110
 111 }
 112
 113 /* dev_loopback_xmit for use with netfilter. */
 114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 115 {
 116         skb_reset_mac_header(newskb);
 117         __skb_pull(newskb, skb_network_offset(newskb));
 118         newskb->pkt_type = PACKET_LOOPBACK;
 119         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 120         BUG_TRAP(newskb->dst);
 121
 122         netif_rx(newskb);
 123         return 0;
 124 }
 125
 126
 127 static int ip6_output2(struct sk_buff *skb)
 128 {
 129         struct dst_entry *dst = skb->dst;
 130         struct net_device *dev = dst->dev;
 131
 132         skb->protocol = htons(ETH_P_IPV6);
 133         skb->dev = dev;
 134
 135         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 136                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 137                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 138
 139                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 140                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 141                                         &ipv6_hdr(skb)->saddr)) {
 142                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 143
 144                         /* Do not check for IFF_ALLMULTI; multicast routing
 145                            is not supported in any case.
 146                          */
 147                         if (newskb)
 148                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 149                                         NULL, newskb->dev,
 150                                         ip6_dev_loopback_xmit);
 151
 152                         if (ipv6_hdr(skb)->hop_limit == 0) {
 153                                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
 154                                 kfree_skb(skb);
 155                                 return 0;
 156                         }
 157                 }
 158
 159                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
 160         }
 161
 162         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 163                        ip6_output_finish);
 164 }
 165
 166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 167 {
 168         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 169
 170         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 171                skb->dst->dev->mtu : dst_mtu(skb->dst);
 172 }
 173
 174 int ip6_output(struct sk_buff *skb)
 175 {
 176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 177                                 dst_allfrag(skb->dst))
 178                 return ip6_fragment(skb, ip6_output2);
 179         else
 180                 return ip6_output2(skb);
 181 }
 182
 183 /*
 184  *      xmit an sk_buff (used by TCP)
 185  */
 186
 187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 188              struct ipv6_txoptions *opt, int ipfragok)
 189 {
 190         struct ipv6_pinfo *np = inet6_sk(sk);
 191         struct in6_addr *first_hop = &fl->fl6_dst;
 192         struct dst_entry *dst = skb->dst;
 193         struct ipv6hdr *hdr;
 194         u8  proto = fl->proto;
 195         int seg_len = skb->len;
 196         int hlimit, tclass;
 197         u32 mtu;
 198
 199         if (opt) {
 200                 unsigned int head_room;
 201
 202                 /* First: exthdrs may take lots of space (~8K for now)
 203                    MAX_HEADER is not enough.
 204                  */
 205                 head_room = opt->opt_nflen + opt->opt_flen;
 206                 seg_len += head_room;
 207                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 208
 209                 if (skb_headroom(skb) < head_room) {
 210                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 211                         if (skb2 == NULL) {
 212                                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 213                                               IPSTATS_MIB_OUTDISCARDS);
 214                                 kfree_skb(skb);
 215                                 return -ENOBUFS;
 216                         }
 217                         kfree_skb(skb);
 218                         skb = skb2;
 219                         if (sk)
 220                                 skb_set_owner_w(skb, sk);
 221                 }
 222                 if (opt->opt_flen)
 223                         ipv6_push_frag_opts(skb, opt, &proto);
 224                 if (opt->opt_nflen)
 225                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 226         }
 227
 228         skb_push(skb, sizeof(struct ipv6hdr));
 229         skb_reset_network_header(skb);
 230         hdr = ipv6_hdr(skb);
 231
 232         /*
 233          *      Fill in the IPv6 header
 234          */
 235
 236         hlimit = -1;
 237         if (np)
 238                 hlimit = np->hop_limit;
 239         if (hlimit < 0)
 240                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
 241         if (hlimit < 0)
 242                 hlimit = ipv6_get_hoplimit(dst->dev);
 243
 244         tclass = -1;
 245         if (np)
 246                 tclass = np->tclass;
 247         if (tclass < 0)
 248                 tclass = 0;
 249
 250         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 251
 252         hdr->payload_len = htons(seg_len);
 253         hdr->nexthdr = proto;
 254         hdr->hop_limit = hlimit;
 255
 256         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 257         ipv6_addr_copy(&hdr->daddr, first_hop);
 258
 259         skb->priority = sk->sk_priority;
 260
 261         mtu = dst_mtu(dst);
 262         if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
 263                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 264                               IPSTATS_MIB_OUTREQUESTS);
 265                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 266                                 dst_output);
 267         }
 268
 269         if (net_ratelimit())
 270                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 271         skb->dev = dst->dev;
 272         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 273         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 274         kfree_skb(skb);
 275         return -EMSGSIZE;
 276 }
 277
 278 EXPORT_SYMBOL(ip6_xmit);
 279
 280 /*
 281  *      To avoid extra problems ND packets are send through this
 282  *      routine. It's code duplication but I really want to avoid
 283  *      extra checks since ipv6_build_header is used by TCP (which
 284  *      is for us performance critical)
 285  */
 286
 287 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 288                struct in6_addr *saddr, struct in6_addr *daddr,
 289                int proto, int len)
 290 {
 291         struct ipv6_pinfo *np = inet6_sk(sk);
 292         struct ipv6hdr *hdr;
 293         int totlen;
 294
 295         skb->protocol = htons(ETH_P_IPV6);
 296         skb->dev = dev;
 297
 298         totlen = len + sizeof(struct ipv6hdr);
 299
 300         skb_reset_network_header(skb);
 301         skb_put(skb, sizeof(struct ipv6hdr));
 302         hdr = ipv6_hdr(skb);
 303
 304         *(__be32*)hdr = htonl(0x60000000);
 305
 306         hdr->payload_len = htons(len);
 307         hdr->nexthdr = proto;
 308         hdr->hop_limit = np->hop_limit;
 309
 310         ipv6_addr_copy(&hdr->saddr, saddr);
 311         ipv6_addr_copy(&hdr->daddr, daddr);
 312
 313         return 0;
 314 }
 315
 316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 317 {
 318         struct ip6_ra_chain *ra;
 319         struct sock *last = NULL;
 320
 321         read_lock(&ip6_ra_lock);
 322         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 323                 struct sock *sk = ra->sk;
 324                 if (sk && ra->sel == sel &&
 325                     (!sk->sk_bound_dev_if ||
 326                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 327                         if (last) {
 328                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 329                                 if (skb2)
 330                                         rawv6_rcv(last, skb2);
 331                         }
 332                         last = sk;
 333                 }
 334         }
 335
 336         if (last) {
 337                 rawv6_rcv(last, skb);
 338                 read_unlock(&ip6_ra_lock);
 339                 return 1;
 340         }
 341         read_unlock(&ip6_ra_lock);
 342         return 0;
 343 }
 344
 345 static int ip6_forward_proxy_check(struct sk_buff *skb)
 346 {
 347         struct ipv6hdr *hdr = ipv6_hdr(skb);
 348         u8 nexthdr = hdr->nexthdr;
 349         int offset;
 350
 351         if (ipv6_ext_hdr(nexthdr)) {
 352                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 353                 if (offset < 0)
 354                         return 0;
 355         } else
 356                 offset = sizeof(struct ipv6hdr);
 357
 358         if (nexthdr == IPPROTO_ICMPV6) {
 359                 struct icmp6hdr *icmp6;
 360
 361                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 362                                          offset + 1 - skb->data)))
 363                         return 0;
 364
 365                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 366
 367                 switch (icmp6->icmp6_type) {
 368                 case NDISC_ROUTER_SOLICITATION:
 369                 case NDISC_ROUTER_ADVERTISEMENT:
 370                 case NDISC_NEIGHBOUR_SOLICITATION:
 371                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 372                 case NDISC_REDIRECT:
 373                         /* For reaction involving unicast neighbor discovery
 374                          * message destined to the proxied address, pass it to
 375                          * input function.
 376                          */
 377                         return 1;
 378                 default:
 379                         break;
 380                 }
 381         }
 382
 383         /*
 384          * The proxying router can't forward traffic sent to a link-local
 385          * address, so signal the sender and discard the packet. This
 386          * behavior is clarified by the MIPv6 specification.
 387          */
 388         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 389                 dst_link_failure(skb);
 390                 return -1;
 391         }
 392
 393         return 0;
 394 }
 395
 396 static inline int ip6_forward_finish(struct sk_buff *skb)
 397 {
 398         return dst_output(skb);
 399 }
 400
 401 int ip6_forward(struct sk_buff *skb)
 402 {
 403         struct dst_entry *dst = skb->dst;
 404         struct ipv6hdr *hdr = ipv6_hdr(skb);
 405         struct inet6_skb_parm *opt = IP6CB(skb);
 406
 407         if (ipv6_devconf.forwarding == 0)
 408                 goto error;
 409
 410         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 411                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 412                 goto drop;
 413         }
 414
 415         skb_forward_csum(skb);
 416
 417         /*
 418          *      We DO NOT make any processing on
 419          *      RA packets, pushing them to user level AS IS
 420          *      without ane WARRANTY that application will be able
 421          *      to interpret them. The reason is that we
 422          *      cannot make anything clever here.
 423          *
 424          *      We are not end-node, so that if packet contains
 425          *      AH/ESP, we cannot make anything.
 426          *      Defragmentation also would be mistake, RA packets
 427          *      cannot be fragmented, because there is no warranty
 428          *      that different fragments will go along one path. --ANK
 429          */
 430         if (opt->ra) {
 431                 u8 *ptr = skb_network_header(skb) + opt->ra;
 432                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 433                         return 0;
 434         }
 435
 436         /*
 437          *      check and decrement ttl
 438          */
 439         if (hdr->hop_limit <= 1) {
 440                 /* Force OUTPUT device used as source address */
 441                 skb->dev = dst->dev;
 442                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 443                             0, skb->dev);
 444                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 445
 446                 kfree_skb(skb);
 447                 return -ETIMEDOUT;
 448         }
 449
 450         /* XXX: idev->cnf.proxy_ndp? */
 451         if (ipv6_devconf.proxy_ndp &&
 452             pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) {
 453                 int proxied = ip6_forward_proxy_check(skb);
 454                 if (proxied > 0)
 455                         return ip6_input(skb);
 456                 else if (proxied < 0) {
 457                         IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 458                         goto drop;
 459                 }
 460         }
 461
 462         if (!xfrm6_route_forward(skb)) {
 463                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 464                 goto drop;
 465         }
 466         dst = skb->dst;
 467
 468         /* IPv6 specs say nothing about it, but it is clear that we cannot
 469            send redirects to source routed frames.
 470            We don't send redirects to frames decapsulated from IPsec.
 471          */
 472         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 473             !skb->sp) {
 474                 struct in6_addr *target = NULL;
 475                 struct rt6_info *rt;
 476                 struct neighbour *n = dst->neighbour;
 477
 478                 /*
 479                  *      incoming and outgoing devices are the same
 480                  *      send a redirect.
 481                  */
 482
 483                 rt = (struct rt6_info *) dst;
 484                 if ((rt->rt6i_flags & RTF_GATEWAY))
 485                         target = (struct in6_addr*)&n->primary_key;
 486                 else
 487                         target = &hdr->daddr;
 488
 489                 /* Limit redirects both by destination (here)
 490                    and by source (inside ndisc_send_redirect)
 491                  */
 492                 if (xrlim_allow(dst, 1*HZ))
 493                         ndisc_send_redirect(skb, n, target);
 494         } else {
 495                 int addrtype = ipv6_addr_type(&hdr->saddr);
 496
 497                 /* This check is security critical. */
 498                 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
 499                         goto error;
 500                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 501                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 502                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 503                         goto error;
 504                 }
 505         }
 506
 507         if (skb->len > dst_mtu(dst)) {
 508                 /* Again, force OUTPUT device used as source address */
 509                 skb->dev = dst->dev;
 510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 511                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 512                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 513                 kfree_skb(skb);
 514                 return -EMSGSIZE;
 515         }
 516
 517         if (skb_cow(skb, dst->dev->hard_header_len)) {
 518                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 519                 goto drop;
 520         }
 521
 522         hdr = ipv6_hdr(skb);
 523
 524         /* Mangling hops number delayed to point after skb COW */
 525
 526         hdr->hop_limit--;
 527
 528         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 529         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 530                        ip6_forward_finish);
 531
 532 error:
 533         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 534 drop:
 535         kfree_skb(skb);
 536         return -EINVAL;
 537 }
 538
 539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 540 {
 541         to->pkt_type = from->pkt_type;
 542         to->priority = from->priority;
 543         to->protocol = from->protocol;
 544         dst_release(to->dst);
 545         to->dst = dst_clone(from->dst);
 546         to->dev = from->dev;
 547         to->mark = from->mark;
 548
 549 #ifdef CONFIG_NET_SCHED
 550         to->tc_index = from->tc_index;
 551 #endif
 552         nf_copy(to, from);
 553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 555         to->nf_trace = from->nf_trace;
 556 #endif
 557         skb_copy_secmark(to, from);
 558 }
 559
 560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 561 {
 562         u16 offset = sizeof(struct ipv6hdr);
 563         struct ipv6_opt_hdr *exthdr =
 564                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 565         unsigned int packet_len = skb->tail - skb->network_header;
 566         int found_rhdr = 0;
 567         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 568
 569         while (offset + 1 <= packet_len) {
 570
 571                 switch (**nexthdr) {
 572
 573                 case NEXTHDR_HOP:
 574                         break;
 575                 case NEXTHDR_ROUTING:
 576                         found_rhdr = 1;
 577                         break;
 578                 case NEXTHDR_DEST:
 579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 580                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 581                                 break;
 582 #endif
 583                         if (found_rhdr)
 584                                 return offset;
 585                         break;
 586                 default :
 587                         return offset;
 588                 }
 589
 590                 offset += ipv6_optlen(exthdr);
 591                 *nexthdr = &exthdr->nexthdr;
 592                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 593                                                  offset);
 594         }
 595
 596         return offset;
 597 }
 598 EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
 599
 600 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 601 {
 602         struct net_device *dev;
 603         struct sk_buff *frag;
 604         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 605         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 606         struct ipv6hdr *tmp_hdr;
 607         struct frag_hdr *fh;
 608         unsigned int mtu, hlen, left, len;
 609         __be32 frag_id = 0;
 610         int ptr, offset = 0, err=0;
 611         u8 *prevhdr, nexthdr = 0;
 612
 613         dev = rt->u.dst.dev;
 614         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 615         nexthdr = *prevhdr;
 616
 617         mtu = ip6_skb_dst_mtu(skb);
 618
 619         /* We must not fragment if the socket is set to force MTU discovery
 620          * or if the skb it not generated by a local socket.  (This last
 621          * check should be redundant, but it's free.)
 622          */
 623         if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) {
 624                 skb->dev = skb->dst->dev;
 625                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 626                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 627                 kfree_skb(skb);
 628                 return -EMSGSIZE;
 629         }
 630
 631         if (np && np->frag_size < mtu) {
 632                 if (np->frag_size)
 633                         mtu = np->frag_size;
 634         }
 635         mtu -= hlen + sizeof(struct frag_hdr);
 636
 637         if (skb_shinfo(skb)->frag_list) {
 638                 int first_len = skb_pagelen(skb);
 639
 640                 if (first_len - hlen > mtu ||
 641                     ((first_len - hlen) & 7) ||
 642                     skb_cloned(skb))
 643                         goto slow_path;
 644
 645                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 646                         /* Correct geometry. */
 647                         if (frag->len > mtu ||
 648                             ((frag->len & 7) && frag->next) ||
 649                             skb_headroom(frag) < hlen)
 650                             goto slow_path;
 651
 652                         /* Partially cloned skb? */
 653                         if (skb_shared(frag))
 654                                 goto slow_path;
 655
 656                         BUG_ON(frag->sk);
 657                         if (skb->sk) {
 658                                 sock_hold(skb->sk);
 659                                 frag->sk = skb->sk;
 660                                 frag->destructor = sock_wfree;
 661                                 skb->truesize -= frag->truesize;
 662                         }
 663                 }
 664
 665                 err = 0;
 666                 offset = 0;
 667                 frag = skb_shinfo(skb)->frag_list;
 668                 skb_shinfo(skb)->frag_list = NULL;
 669                 /* BUILD HEADER */
 670
 671                 *prevhdr = NEXTHDR_FRAGMENT;
 672                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 673                 if (!tmp_hdr) {
 674                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 675                         return -ENOMEM;
 676                 }
 677
 678                 __skb_pull(skb, hlen);
 679                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 680                 __skb_push(skb, hlen);
 681                 skb_reset_network_header(skb);
 682                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 683
 684                 ipv6_select_ident(skb, fh);
 685                 fh->nexthdr = nexthdr;
 686                 fh->reserved = 0;
 687                 fh->frag_off = htons(IP6_MF);
 688                 frag_id = fh->identification;
 689
 690                 first_len = skb_pagelen(skb);
 691                 skb->data_len = first_len - skb_headlen(skb);
 692                 skb->len = first_len;
 693                 ipv6_hdr(skb)->payload_len = htons(first_len -
 694                                                    sizeof(struct ipv6hdr));
 695
 696                 dst_hold(&rt->u.dst);
 697
 698                 for (;;) {
 699                         /* Prepare header of the next frame,
 700                          * before previous one went down. */
 701                         if (frag) {
 702                                 frag->ip_summed = CHECKSUM_NONE;
 703                                 skb_reset_transport_header(frag);
 704                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 705                                 __skb_push(frag, hlen);
 706                                 skb_reset_network_header(frag);
 707                                 memcpy(skb_network_header(frag), tmp_hdr,
 708                                        hlen);
 709                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 710                                 fh->nexthdr = nexthdr;
 711                                 fh->reserved = 0;
 712                                 fh->frag_off = htons(offset);
 713                                 if (frag->next != NULL)
 714                                         fh->frag_off |= htons(IP6_MF);
 715                                 fh->identification = frag_id;
 716                                 ipv6_hdr(frag)->payload_len =
 717                                                 htons(frag->len -
 718                                                       sizeof(struct ipv6hdr));
 719                                 ip6_copy_metadata(frag, skb);
 720                         }
 721
 722                         err = output(skb);
 723                         if(!err)
 724                                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
 725
 726                         if (err || !frag)
 727                                 break;
 728
 729                         skb = frag;
 730                         frag = skb->next;
 731                         skb->next = NULL;
 732                 }
 733
 734                 kfree(tmp_hdr);
 735
 736                 if (err == 0) {
 737                         IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
 738                         dst_release(&rt->u.dst);
 739                         return 0;
 740                 }
 741
 742                 while (frag) {
 743                         skb = frag->next;
 744                         kfree_skb(frag);
 745                         frag = skb;
 746                 }
 747
 748                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
 749                 dst_release(&rt->u.dst);
 750                 return err;
 751         }
 752
 753 slow_path:
 754         left = skb->len - hlen;         /* Space per frame */
 755         ptr = hlen;                     /* Where to start from */
 756
 757         /*
 758          *      Fragment the datagram.
 759          */
 760
 761         *prevhdr = NEXTHDR_FRAGMENT;
 762
 763         /*
 764          *      Keep copying data until we run out.
 765          */
 766         while(left > 0) {
 767                 len = left;
 768                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 769                 if (len > mtu)
 770                         len = mtu;
 771                 /* IF: we are not sending upto and including the packet end
 772                    then align the next start on an eight byte boundary */
 773                 if (len < left) {
 774                         len &= ~7;
 775                 }
 776                 /*
 777                  *      Allocate buffer.
 778                  */
 779
 780                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 781                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 782                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 783                                       IPSTATS_MIB_FRAGFAILS);
 784                         err = -ENOMEM;
 785                         goto fail;
 786                 }
 787
 788                 /*
 789                  *      Set up data on packet
 790                  */
 791
 792                 ip6_copy_metadata(frag, skb);
 793                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 794                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 795                 skb_reset_network_header(frag);
 796                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 797                 frag->transport_header = (frag->network_header + hlen +
 798                                           sizeof(struct frag_hdr));
 799
 800                 /*
 801                  *      Charge the memory for the fragment to any owner
 802                  *      it might possess
 803                  */
 804                 if (skb->sk)
 805                         skb_set_owner_w(frag, skb->sk);
 806
 807                 /*
 808                  *      Copy the packet header into the new buffer.
 809                  */
 810                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 811
 812                 /*
 813                  *      Build fragment header.
 814                  */
 815                 fh->nexthdr = nexthdr;
 816                 fh->reserved = 0;
 817                 if (!frag_id) {
 818                         ipv6_select_ident(skb, fh);
 819                         frag_id = fh->identification;
 820                 } else
 821                         fh->identification = frag_id;
 822
 823                 /*
 824                  *      Copy a block of the IP datagram.
 825                  */
 826                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 827                         BUG();
 828                 left -= len;
 829
 830                 fh->frag_off = htons(offset);
 831                 if (left > 0)
 832                         fh->frag_off |= htons(IP6_MF);
 833                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 834                                                     sizeof(struct ipv6hdr));
 835
 836                 ptr += len;
 837                 offset += len;
 838
 839                 /*
 840                  *      Put this fragment into the sending queue.
 841                  */
 842                 err = output(frag);
 843                 if (err)
 844                         goto fail;
 845
 846                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
 847         }
 848         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 849                       IPSTATS_MIB_FRAGOKS);
 850         kfree_skb(skb);
 851         return err;
 852
 853 fail:
 854         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 855                       IPSTATS_MIB_FRAGFAILS);
 856         kfree_skb(skb);
 857         return err;
 858 }
 859
 860 static inline int ip6_rt_check(struct rt6key *rt_key,
 861                                struct in6_addr *fl_addr,
 862                                struct in6_addr *addr_cache)
 863 {
 864         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 865                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 866 }
 867
 868 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 869                                           struct dst_entry *dst,
 870                                           struct flowi *fl)
 871 {
 872         struct ipv6_pinfo *np = inet6_sk(sk);
 873         struct rt6_info *rt = (struct rt6_info *)dst;
 874
 875         if (!dst)
 876                 goto out;
 877
 878         /* Yes, checking route validity in not connected
 879          * case is not very simple. Take into account,
 880          * that we do not support routing by source, TOS,
 881          * and MSG_DONTROUTE            --ANK (980726)
 882          *
 883          * 1. ip6_rt_check(): If route was host route,
 884          *    check that cached destination is current.
 885          *    If it is network route, we still may
 886          *    check its validity using saved pointer
 887          *    to the last used address: daddr_cache.
 888          *    We do not want to save whole address now,
 889          *    (because main consumer of this service
 890          *    is tcp, which has not this problem),
 891          *    so that the last trick works only on connected
 892          *    sockets.
 893          * 2. oif also should be the same.
 894          */
 895         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 896 #ifdef CONFIG_IPV6_SUBTREES
 897             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 898 #endif
 899             (fl->oif && fl->oif != dst->dev->ifindex)) {
 900                 dst_release(dst);
 901                 dst = NULL;
 902         }
 903
 904 out:
 905         return dst;
 906 }
 907
 908 static int ip6_dst_lookup_tail(struct sock *sk,
 909                                struct dst_entry **dst, struct flowi *fl)
 910 {
 911         int err;
 912
 913         if (*dst == NULL)
 914                 *dst = ip6_route_output(sk, fl);
 915
 916         if ((err = (*dst)->error))
 917                 goto out_err_release;
 918
 919         if (ipv6_addr_any(&fl->fl6_src)) {
 920                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 921                 if (err)
 922                         goto out_err_release;
 923         }
 924
 925 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 926                 /*
 927                  * Here if the dst entry we've looked up
 928                  * has a neighbour entry that is in the INCOMPLETE
 929                  * state and the src address from the flow is
 930                  * marked as OPTIMISTIC, we release the found
 931                  * dst entry and replace it instead with the
 932                  * dst entry of the nexthop router
 933                  */
 934                 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
 935                         struct inet6_ifaddr *ifp;
 936                         struct flowi fl_gw;
 937                         int redirect;
 938
 939                         ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src,
 940                                               (*dst)->dev, 1);
 941
 942                         redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 943                         if (ifp)
 944                                 in6_ifa_put(ifp);
 945
 946                         if (redirect) {
 947                                 /*
 948                                  * We need to get the dst entry for the
 949                                  * default router instead
 950                                  */
 951                                 dst_release(*dst);
 952                                 memcpy(&fl_gw, fl, sizeof(struct flowi));
 953                                 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 954                                 *dst = ip6_route_output(sk, &fl_gw);
 955                                 if ((err = (*dst)->error))
 956                                         goto out_err_release;
 957                         }
 958                 }
 959 #endif
 960
 961         return 0;
 962
 963 out_err_release:
 964         if (err == -ENETUNREACH)
 965                 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
 966         dst_release(*dst);
 967         *dst = NULL;
 968         return err;
 969 }
 970
 971 /**
 972  *      ip6_dst_lookup - perform route lookup on flow
 973  *      @sk: socket which provides route info
 974  *      @dst: pointer to dst_entry * for result
 975  *      @fl: flow to lookup
 976  *
 977  *      This function performs a route lookup on the given flow.
 978  *
 979  *      It returns zero on success, or a standard errno code on error.
 980  */
 981 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 982 {
 983         *dst = NULL;
 984         return ip6_dst_lookup_tail(sk, dst, fl);
 985 }
 986 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 987
 988 /**
 989  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
 990  *      @sk: socket which provides the dst cache and route info
 991  *      @dst: pointer to dst_entry * for result
 992  *      @fl: flow to lookup
 993  *
 994  *      This function performs a route lookup on the given flow with the
 995  *      possibility of using the cached route in the socket if it is valid.
 996  *      It will take the socket dst lock when operating on the dst cache.
 997  *      As a result, this function can only be used in process context.
 998  *
 999  *      It returns zero on success, or a standard errno code on error.
1000  */
1001 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1002 {
1003         *dst = NULL;
1004         if (sk) {
1005                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1006                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1007         }
1008
1009         return ip6_dst_lookup_tail(sk, dst, fl);
1010 }
1011 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1012
1013 static inline int ip6_ufo_append_data(struct sock *sk,
1014                         int getfrag(void *from, char *to, int offset, int len,
1015                         int odd, struct sk_buff *skb),
1016                         void *from, int length, int hh_len, int fragheaderlen,
1017                         int transhdrlen, int mtu,unsigned int flags)
1018
1019 {
1020         struct sk_buff *skb;
1021         int err;
1022
1023         /* There is support for UDP large send offload by network
1024          * device, so create one single skb packet containing complete
1025          * udp datagram
1026          */
1027         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1028                 skb = sock_alloc_send_skb(sk,
1029                         hh_len + fragheaderlen + transhdrlen + 20,
1030                         (flags & MSG_DONTWAIT), &err);
1031                 if (skb == NULL)
1032                         return -ENOMEM;
1033
1034                 /* reserve space for Hardware header */
1035                 skb_reserve(skb, hh_len);
1036
1037                 /* create space for UDP/IP header */
1038                 skb_put(skb,fragheaderlen + transhdrlen);
1039
1040                 /* initialize network header pointer */
1041                 skb_reset_network_header(skb);
1042
1043                 /* initialize protocol header pointer */
1044                 skb->transport_header = skb->network_header + fragheaderlen;
1045
1046                 skb->ip_summed = CHECKSUM_PARTIAL;
1047                 skb->csum = 0;
1048                 sk->sk_sndmsg_off = 0;
1049         }
1050
1051         err = skb_append_datato_frags(sk,skb, getfrag, from,
1052                                       (length - transhdrlen));
1053         if (!err) {
1054                 struct frag_hdr fhdr;
1055
1056                 /* specify the length of each IP datagram fragment*/
1057                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1058                                             sizeof(struct frag_hdr);
1059                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1060                 ipv6_select_ident(skb, &fhdr);
1061                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1062                 __skb_queue_tail(&sk->sk_write_queue, skb);
1063
1064                 return 0;
1065         }
1066         /* There is not enough support do UPD LSO,
1067          * so follow normal path
1068          */
1069         kfree_skb(skb);
1070
1071         return err;
1072 }
1073
1074 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1075         int offset, int len, int odd, struct sk_buff *skb),
1076         void *from, int length, int transhdrlen,
1077         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1078         struct rt6_info *rt, unsigned int flags)
1079 {
1080         struct inet_sock *inet = inet_sk(sk);
1081         struct ipv6_pinfo *np = inet6_sk(sk);
1082         struct sk_buff *skb;
1083         unsigned int maxfraglen, fragheaderlen;
1084         int exthdrlen;
1085         int hh_len;
1086         int mtu;
1087         int copy;
1088         int err;
1089         int offset = 0;
1090         int csummode = CHECKSUM_NONE;
1091
1092         if (flags&MSG_PROBE)
1093                 return 0;
1094         if (skb_queue_empty(&sk->sk_write_queue)) {
1095                 /*
1096                  * setup for corking
1097                  */
1098                 if (opt) {
1099                         if (np->cork.opt == NULL) {
1100                                 np->cork.opt = kmalloc(opt->tot_len,
1101                                                        sk->sk_allocation);
1102                                 if (unlikely(np->cork.opt == NULL))
1103                                         return -ENOBUFS;
1104                         } else if (np->cork.opt->tot_len < opt->tot_len) {
1105                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1106                                 return -EINVAL;
1107                         }
1108                         memcpy(np->cork.opt, opt, opt->tot_len);
1109                         inet->cork.flags |= IPCORK_OPT;
1110                         /* need source address above miyazawa*/
1111                 }
1112                 dst_hold(&rt->u.dst);
1113                 np->cork.rt = rt;
1114                 inet->cork.fl = *fl;
1115                 np->cork.hop_limit = hlimit;
1116                 np->cork.tclass = tclass;
1117                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1118                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1119                 if (np->frag_size < mtu) {
1120                         if (np->frag_size)
1121                                 mtu = np->frag_size;
1122                 }
1123                 inet->cork.fragsize = mtu;
1124                 if (dst_allfrag(rt->u.dst.path))
1125                         inet->cork.flags |= IPCORK_ALLFRAG;
1126                 inet->cork.length = 0;
1127                 sk->sk_sndmsg_page = NULL;
1128                 sk->sk_sndmsg_off = 0;
1129                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1130                             rt->rt6i_nfheader_len;
1131                 length += exthdrlen;
1132                 transhdrlen += exthdrlen;
1133         } else {
1134                 rt = np->cork.rt;
1135                 fl = &inet->cork.fl;
1136                 if (inet->cork.flags & IPCORK_OPT)
1137                         opt = np->cork.opt;
1138                 transhdrlen = 0;
1139                 exthdrlen = 0;
1140                 mtu = inet->cork.fragsize;
1141         }
1142
1143         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1144
1145         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1146                         (opt ? opt->opt_nflen : 0);
1147         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1148
1149         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1150                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1151                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1152                         return -EMSGSIZE;
1153                 }
1154         }
1155
1156         /*
1157          * Let's try using as much space as possible.
1158          * Use MTU if total length of the message fits into the MTU.
1159          * Otherwise, we need to reserve fragment header and
1160          * fragment alignment (= 8-15 octects, in total).
1161          *
1162          * Note that we may need to "move" the data from the tail of
1163          * of the buffer to the new fragment when we split
1164          * the message.
1165          *
1166          * FIXME: It may be fragmented into multiple chunks
1167          *        at once if non-fragmentable extension headers
1168          *        are too large.
1169          * --yoshfuji
1170          */
1171
1172         inet->cork.length += length;
1173         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1174             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1175
1176                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1177                                           fragheaderlen, transhdrlen, mtu,
1178                                           flags);
1179                 if (err)
1180                         goto error;
1181                 return 0;
1182         }
1183
1184         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1185                 goto alloc_new_skb;
1186
1187         while (length > 0) {
1188                 /* Check if the remaining data fits into current packet. */
1189                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1190                 if (copy < length)
1191                         copy = maxfraglen - skb->len;
1192
1193                 if (copy <= 0) {
1194                         char *data;
1195                         unsigned int datalen;
1196                         unsigned int fraglen;
1197                         unsigned int fraggap;
1198                         unsigned int alloclen;
1199                         struct sk_buff *skb_prev;
1200 alloc_new_skb:
1201                         skb_prev = skb;
1202
1203                         /* There's no room in the current skb */
1204                         if (skb_prev)
1205                                 fraggap = skb_prev->len - maxfraglen;
1206                         else
1207                                 fraggap = 0;
1208
1209                         /*
1210                          * If remaining data exceeds the mtu,
1211                          * we know we need more fragment(s).
1212                          */
1213                         datalen = length + fraggap;
1214                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1215                                 datalen = maxfraglen - fragheaderlen;
1216
1217                         fraglen = datalen + fragheaderlen;
1218                         if ((flags & MSG_MORE) &&
1219                             !(rt->u.dst.dev->features&NETIF_F_SG))
1220                                 alloclen = mtu;
1221                         else
1222                                 alloclen = datalen + fragheaderlen;
1223
1224                         /*
1225                          * The last fragment gets additional space at tail.
1226                          * Note: we overallocate on fragments with MSG_MODE
1227                          * because we have no idea if we're the last one.
1228                          */
1229                         if (datalen == length + fraggap)
1230                                 alloclen += rt->u.dst.trailer_len;
1231
1232                         /*
1233                          * We just reserve space for fragment header.
1234                          * Note: this may be overallocation if the message
1235                          * (without MSG_MORE) fits into the MTU.
1236                          */
1237                         alloclen += sizeof(struct frag_hdr);
1238
1239                         if (transhdrlen) {
1240                                 skb = sock_alloc_send_skb(sk,
1241                                                 alloclen + hh_len,
1242                                                 (flags & MSG_DONTWAIT), &err);
1243                         } else {
1244                                 skb = NULL;
1245                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1246                                     2 * sk->sk_sndbuf)
1247                                         skb = sock_wmalloc(sk,
1248                                                            alloclen + hh_len, 1,
1249                                                            sk->sk_allocation);
1250                                 if (unlikely(skb == NULL))
1251                                         err = -ENOBUFS;
1252                         }
1253                         if (skb == NULL)
1254                                 goto error;
1255                         /*
1256                          *      Fill in the control structures
1257                          */
1258                         skb->ip_summed = csummode;
1259                         skb->csum = 0;
1260                         /* reserve for fragmentation */
1261                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1262
1263                         /*
1264                          *      Find where to start putting bytes
1265                          */
1266                         data = skb_put(skb, fraglen);
1267                         skb_set_network_header(skb, exthdrlen);
1268                         data += fragheaderlen;
1269                         skb->transport_header = (skb->network_header +
1270                                                  fragheaderlen);
1271                         if (fraggap) {
1272                                 skb->csum = skb_copy_and_csum_bits(
1273                                         skb_prev, maxfraglen,
1274                                         data + transhdrlen, fraggap, 0);
1275                                 skb_prev->csum = csum_sub(skb_prev->csum,
1276                                                           skb->csum);
1277                                 data += fraggap;
1278                                 pskb_trim_unique(skb_prev, maxfraglen);
1279                         }
1280                         copy = datalen - transhdrlen - fraggap;
1281                         if (copy < 0) {
1282                                 err = -EINVAL;
1283                                 kfree_skb(skb);
1284                                 goto error;
1285                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1286                                 err = -EFAULT;
1287                                 kfree_skb(skb);
1288                                 goto error;
1289                         }
1290
1291                         offset += copy;
1292                         length -= datalen - fraggap;
1293                         transhdrlen = 0;
1294                         exthdrlen = 0;
1295                         csummode = CHECKSUM_NONE;
1296
1297                         /*
1298                          * Put the packet on the pending queue
1299                          */
1300                         __skb_queue_tail(&sk->sk_write_queue, skb);
1301                         continue;
1302                 }
1303
1304                 if (copy > length)
1305                         copy = length;
1306
1307                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1308                         unsigned int off;
1309
1310                         off = skb->len;
1311                         if (getfrag(from, skb_put(skb, copy),
1312                                                 offset, copy, off, skb) < 0) {
1313                                 __skb_trim(skb, off);
1314                                 err = -EFAULT;
1315                                 goto error;
1316                         }
1317                 } else {
1318                         int i = skb_shinfo(skb)->nr_frags;
1319                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1320                         struct page *page = sk->sk_sndmsg_page;
1321                         int off = sk->sk_sndmsg_off;
1322                         unsigned int left;
1323
1324                         if (page && (left = PAGE_SIZE - off) > 0) {
1325                                 if (copy >= left)
1326                                         copy = left;
1327                                 if (page != frag->page) {
1328                                         if (i == MAX_SKB_FRAGS) {
1329                                                 err = -EMSGSIZE;
1330                                                 goto error;
1331                                         }
1332                                         get_page(page);
1333                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1334                                         frag = &skb_shinfo(skb)->frags[i];
1335                                 }
1336                         } else if(i < MAX_SKB_FRAGS) {
1337                                 if (copy > PAGE_SIZE)
1338                                         copy = PAGE_SIZE;
1339                                 page = alloc_pages(sk->sk_allocation, 0);
1340                                 if (page == NULL) {
1341                                         err = -ENOMEM;
1342                                         goto error;
1343                                 }
1344                                 sk->sk_sndmsg_page = page;
1345                                 sk->sk_sndmsg_off = 0;
1346
1347                                 skb_fill_page_desc(skb, i, page, 0, 0);
1348                                 frag = &skb_shinfo(skb)->frags[i];
1349                         } else {
1350                                 err = -EMSGSIZE;
1351                                 goto error;
1352                         }
1353                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1354                                 err = -EFAULT;
1355                                 goto error;
1356                         }
1357                         sk->sk_sndmsg_off += copy;
1358                         frag->size += copy;
1359                         skb->len += copy;
1360                         skb->data_len += copy;
1361                         skb->truesize += copy;
1362                         atomic_add(copy, &sk->sk_wmem_alloc);
1363                 }
1364                 offset += copy;
1365                 length -= copy;
1366         }
1367         return 0;
1368 error:
1369         inet->cork.length -= length;
1370         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1371         return err;
1372 }
1373
1374 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1375 {
1376         inet->cork.flags &= ~IPCORK_OPT;
1377         kfree(np->cork.opt);
1378         np->cork.opt = NULL;
1379         if (np->cork.rt) {
1380                 dst_release(&np->cork.rt->u.dst);
1381                 np->cork.rt = NULL;
1382                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1383         }
1384         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1385 }
1386
1387 int ip6_push_pending_frames(struct sock *sk)
1388 {
1389         struct sk_buff *skb, *tmp_skb;
1390         struct sk_buff **tail_skb;
1391         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1392         struct inet_sock *inet = inet_sk(sk);
1393         struct ipv6_pinfo *np = inet6_sk(sk);
1394         struct ipv6hdr *hdr;
1395         struct ipv6_txoptions *opt = np->cork.opt;
1396         struct rt6_info *rt = np->cork.rt;
1397         struct flowi *fl = &inet->cork.fl;
1398         unsigned char proto = fl->proto;
1399         int err = 0;
1400
1401         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1402                 goto out;
1403         tail_skb = &(skb_shinfo(skb)->frag_list);
1404
1405         /* move skb->data to ip header from ext header */
1406         if (skb->data < skb_network_header(skb))
1407                 __skb_pull(skb, skb_network_offset(skb));
1408         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1409                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1410                 *tail_skb = tmp_skb;
1411                 tail_skb = &(tmp_skb->next);
1412                 skb->len += tmp_skb->len;
1413                 skb->data_len += tmp_skb->len;
1414                 skb->truesize += tmp_skb->truesize;
1415                 __sock_put(tmp_skb->sk);
1416                 tmp_skb->destructor = NULL;
1417                 tmp_skb->sk = NULL;
1418         }
1419
1420         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1421         __skb_pull(skb, skb_network_header_len(skb));
1422         if (opt && opt->opt_flen)
1423                 ipv6_push_frag_opts(skb, opt, &proto);
1424         if (opt && opt->opt_nflen)
1425                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1426
1427         skb_push(skb, sizeof(struct ipv6hdr));
1428         skb_reset_network_header(skb);
1429         hdr = ipv6_hdr(skb);
1430
1431         *(__be32*)hdr = fl->fl6_flowlabel |
1432                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1433
1434         hdr->hop_limit = np->cork.hop_limit;
1435         hdr->nexthdr = proto;
1436         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1437         ipv6_addr_copy(&hdr->daddr, final_dst);
1438
1439         skb->priority = sk->sk_priority;
1440
1441         skb->dst = dst_clone(&rt->u.dst);
1442         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1443         if (proto == IPPROTO_ICMPV6) {
1444                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1445
1446                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1447                 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1448         }
1449
1450         err = ip6_local_out(skb);
1451         if (err) {
1452                 if (err > 0)
1453                         err = np->recverr ? net_xmit_errno(err) : 0;
1454                 if (err)
1455                         goto error;
1456         }
1457
1458 out:
1459         ip6_cork_release(inet, np);
1460         return err;
1461 error:
1462         goto out;
1463 }
1464
1465 void ip6_flush_pending_frames(struct sock *sk)
1466 {
1467         struct sk_buff *skb;
1468
1469         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1470                 if (skb->dst)
1471                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
1472                                       IPSTATS_MIB_OUTDISCARDS);
1473                 kfree_skb(skb);
1474         }
1475
1476         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1477 }